In [65]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn import preprocessing
from sklearn.cross_validation import cross_val_score

from sklearn.linear_model import LogisticRegression



### Check initial metrics for 5 popular modeling techniques using obtained features

In [34]:
selected_features = ["race", "gender", "age2", "admission_type_id2", "discharge_disposition_id2", "admission_source_id2", "time_in_hospital", "num_lab_procedures", "num_procedures", "num_medications", "number_outpatient_log", "number_emergency_log", "number_inpatient", "number_diagnoses", "max_glu_serum", "A1Cresult", "insulin", "Diabetes Med (Up)", "Diabetes Med (Steady)", "Diabetes Med (Down)", "change", "diabetesMed"]
target = "readmitted2"

In [45]:
training_df = pd.read_csv("../modeling_data/diabetes_data_v4_p1_70pct.csv", header=0)
training_X = training_df[selected_features]
training_Y = training_df[target]

validation_df = pd.read_csv("../modeling_data/diabetes_data_v4_p2_15pct.csv", header=0)
validation_X = validation_df[selected_features]
validation_Y = validation_df[target]

training_X.head()

Unnamed: 0,race,gender,age2,admission_type_id2,discharge_disposition_id2,admission_source_id2,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,number_inpatient,number_diagnoses,max_glu_serum,A1Cresult,insulin,Diabetes Med (Up),Diabetes Med (Steady),Diabetes Med (Down),change,diabetesMed
0,Caucasian,Female,<30,1,1,3,2,33,0,4,...,0,2,,>8,Steady,0,1,0,No,Yes
1,Caucasian,Male,>60,3,1,1,2,33,0,16,...,1,9,,,Steady,0,2,0,Ch,Yes
2,Caucasian,Female,>60,4,6,1,5,47,3,24,...,0,7,,,Steady,0,1,0,No,Yes
3,Caucasian,Female,>60,2,1,1,5,32,1,16,...,0,9,,Norm,Steady,0,1,0,No,Yes
4,Caucasian,Male,30 - 60,3,1,1,3,32,1,4,...,0,9,,>8,No,0,0,0,No,No


### Apply one-hot encoding

In [46]:
# one-hot-encoding on categorical features
# convert nominal values to dummy values
def one_hot_encode(df):
    df_age = pd.get_dummies(df['age2'])
    df_race = pd.get_dummies(df['race'])
    df_gender = pd.get_dummies(df['gender'])
    df_max_glu_serum = pd.get_dummies(df['max_glu_serum'])
    df_A1Cresult = pd.get_dummies(df['A1Cresult'])
    df_insulin = pd.get_dummies(df['insulin'])
    df_change = pd.get_dummies(df['change'])
    df_diabetesMed = pd.get_dummies(df['diabetesMed'])
    df_discharge_disposition_id = pd.get_dummies(df['discharge_disposition_id2'])
    df_admission_source_id = pd.get_dummies(df['admission_source_id2'])
    df_admission_type_id = pd.get_dummies(df['admission_type_id2'])

    df = pd.concat([df, df_age, df_race, df_gender, df_max_glu_serum, df_A1Cresult, 
                      df_insulin, df_change, df_diabetesMed, df_discharge_disposition_id, 
                      df_admission_source_id, df_admission_type_id], axis=1)
    df.drop(['age2', 'race', 'gender', 'max_glu_serum', 'A1Cresult', 'insulin', 'change', 
                      'diabetesMed', 'discharge_disposition_id2', 'admission_source_id2', 
                      'admission_type_id2'], axis=1, inplace=True)
    
    return df

In [47]:
def convert_readmitted(x):
    if x == "NO":
        return 0
    return 1

In [48]:
training_X = one_hot_encode(training_X)

In [49]:
training_X.head()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient_log,number_emergency_log,number_inpatient,number_diagnoses,Diabetes Med (Up),Diabetes Med (Steady),...,1,2,3,4,5,6,1.1,2.1,3.1,4.1
0,2,33,0,4,0.0,0.0,0,2,0,1,...,0,0,1,0,0,0,1,0,0,0
1,2,33,0,16,0.0,0.0,1,9,0,2,...,1,0,0,0,0,0,0,0,1,0
2,5,47,3,24,0.0,0.0,0,7,0,1,...,1,0,0,0,0,0,0,0,0,1
3,5,32,1,16,0.30103,0.0,0,9,0,1,...,1,0,0,0,0,0,0,1,0,0
4,3,32,1,4,0.0,0.0,0,9,0,0,...,1,0,0,0,0,0,0,0,1,0


In [50]:
training_X.columns

Index([     'time_in_hospital',    'num_lab_procedures',
              'num_procedures',       'num_medications',
       'number_outpatient_log',  'number_emergency_log',
            'number_inpatient',      'number_diagnoses',
           'Diabetes Med (Up)', 'Diabetes Med (Steady)',
         'Diabetes Med (Down)',               '30 - 60',
                         '<30',                   '>60',
             'AfricanAmerican',                 'Asian',
                   'Caucasian',              'Hispanic',
                       'Other',                'Female',
                        'Male',                  '>200',
                        '>300',                  'None',
                        'Norm',                    '>7',
                          '>8',                  'None',
                        'Norm',                  'Down',
                          'No',                'Steady',
                          'Up',                    'Ch',
                          'No',

In [51]:
validation_X = one_hot_encode(validation_X)

In [52]:
validation_X.head()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient_log,number_emergency_log,number_inpatient,number_diagnoses,Diabetes Med (Up),Diabetes Med (Steady),...,1,2,3,4,5,6,1.1,2.1,3.1,4.1
0,6,58,3,12,0.0,0.0,0,9,0,0,...,0,0,1,0,0,0,1,0,0,0
1,2,42,0,13,0.0,0.0,0,9,1,0,...,1,0,0,0,0,0,0,0,1,0
2,7,74,0,25,0.0,0.30103,2,9,1,0,...,1,0,0,0,0,0,1,0,0,0
3,9,86,5,65,0.0,0.0,0,9,1,1,...,1,0,0,0,0,0,0,0,1,0
4,7,54,1,16,0.0,0.0,0,5,0,0,...,1,0,0,0,0,0,0,1,0,0


In [53]:
training_Y = training_Y.apply(convert_readmitted)
training_Y.head()

0    0
1    0
2    1
3    0
4    0
Name: readmitted2, dtype: int64

In [54]:
validation_Y = validation_Y.apply(convert_readmitted)
validation_Y.head()

0    1
1    1
2    1
3    0
4    0
Name: readmitted2, dtype: int64

### Normalize the numeric values

In [57]:
# feature scaling, features are standardized to have zero mean and unit variance
feature_scale_cols = ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 
                      'number_diagnoses', 'number_inpatient', 'number_emergency_log', 'number_outpatient_log', 'Diabetes Med (Up)', 'Diabetes Med (Steady)', 'Diabetes Med (Down)']

def standardize_numeric_values(train_df, validation_df=None):
    # Fit on the training data
    scaler = preprocessing.StandardScaler().fit(train_df[feature_scale_cols])
    
    if validation_df is not None:
        transform_df = validation_df
    else:
        transform_df = train_df
    
    data_scaler = scaler.transform(transform_df[feature_scale_cols])
    data_scaler_df = pd.DataFrame(data=data_scaler, columns=feature_scale_cols, index=transform_df.index)
    transform_df.drop(feature_scale_cols, axis=1, inplace=True)
    transform_df = pd.concat([transform_df, data_scaler_df], axis=1)
    return transform_df



In [63]:
training_X = standardize_numeric_values(training_X)
training_X.head(10)

Unnamed: 0,30 - 60,<30,>60,AfricanAmerican,Asian,Caucasian,Hispanic,Other,Female,Male,...,num_lab_procedures,num_procedures,num_medications,number_diagnoses,number_inpatient,number_emergency_log,number_outpatient_log,Diabetes Med (Up),Diabetes Med (Steady),Diabetes Med (Down)
0,0,1,0,0,0,1,0,0,1,0,...,-0.508896,-0.784701,-1.480127,-2.788362,-0.504754,-0.323378,-0.40212,-0.392688,0.372151,-0.388683
1,0,0,1,0,0,1,0,0,0,1,...,-0.508896,-0.784701,0.002392,0.823263,0.28902,-0.323378,-0.40212,-0.392688,1.682817,-0.388683
2,0,0,1,0,0,1,0,0,1,0,...,0.203212,0.975363,0.990738,-0.20863,-0.504754,-0.323378,-0.40212,-0.392688,0.372151,-0.388683
3,0,0,1,0,0,1,0,0,1,0,...,-0.559761,-0.198013,0.002392,0.823263,-0.504754,-0.323378,1.206871,-0.392688,0.372151,-0.388683
4,1,0,0,0,0,1,0,0,0,1,...,-0.559761,-0.198013,-1.480127,0.823263,-0.504754,-0.323378,-0.40212,-0.392688,-0.938514,-0.388683
5,1,0,0,1,0,0,0,0,1,0,...,-0.051112,-0.784701,0.620109,0.823263,-0.504754,-0.323378,-0.40212,-0.392688,-0.938514,2.508684
6,0,0,1,0,0,1,0,0,0,1,...,-1.271868,-0.784701,0.002392,-0.20863,-0.504754,-0.323378,2.148071,-0.392688,1.682817,-0.388683
7,1,0,0,0,0,1,0,0,1,0,...,0.355806,0.975363,1.114282,0.823263,-0.504754,-0.323378,-0.40212,-0.392688,-0.938514,-0.388683
8,1,0,0,1,0,0,0,0,1,0,...,0.762725,1.562051,1.731998,-0.20863,-0.504754,-0.323378,-0.40212,-0.392688,1.682817,-0.388683
9,1,0,0,0,0,1,0,0,1,0,...,-0.305436,-0.198013,2.596801,0.823263,-0.504754,-0.323378,-0.40212,2.305761,0.372151,-0.388683


In [59]:
validation_X = standardize_numeric_values(training_X, validation_X)
validation_X[feature_scale_cols].head(10)

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_diagnoses,number_inpatient,number_emergency_log,number_outpatient_log,Diabetes Med (Up),Diabetes Med (Steady),Diabetes Med (Down)
0,6.0,58.0,3.0,12.0,9.0,4.7806860000000005e-17,8.658973e-17,-4.8465910000000005e-17,5.779408e-18,7.574066e-17,3.518341e-17
1,2.0,42.0,-2.382738e-17,13.0,9.0,4.7806860000000005e-17,8.658973e-17,-4.8465910000000005e-17,1.0,7.574066e-17,3.518341e-17
2,7.0,74.0,-2.382738e-17,25.0,9.0,2.0,0.30103,-4.8465910000000005e-17,1.0,7.574066e-17,3.518341e-17
3,9.0,86.0,5.0,65.0,9.0,4.7806860000000005e-17,8.658973e-17,-4.8465910000000005e-17,1.0,1.0,3.518341e-17
4,7.0,54.0,1.0,16.0,5.0,4.7806860000000005e-17,8.658973e-17,-4.8465910000000005e-17,5.779408e-18,7.574066e-17,3.518341e-17
5,12.0,62.0,-2.382738e-17,21.0,9.0,3.0,8.658973e-17,-4.8465910000000005e-17,5.779408e-18,1.0,3.518341e-17
6,5.0,31.0,2.0,25.0,8.0,4.7806860000000005e-17,8.658973e-17,-4.8465910000000005e-17,5.779408e-18,2.0,3.518341e-17
7,2.0,24.0,6.0,15.0,9.0,1.0,8.658973e-17,0.69897,5.779408e-18,7.574066e-17,3.518341e-17
8,3.0,20.0,3.0,30.0,6.0,4.7806860000000005e-17,8.658973e-17,-4.8465910000000005e-17,5.779408e-18,2.0,3.518341e-17
9,13.0,55.0,1.0,37.0,9.0,4.7806860000000005e-17,8.658973e-17,-4.8465910000000005e-17,5.779408e-18,7.574066e-17,2.0


## Modelling starts below

### Naive Bayes

In [83]:
# Naive Bayes# Naive  
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
NB_score = cross_val_score(nb, training_X, training_Y, cv=10, scoring='accuracy').mean()
NB_score

0.5098603431978159

In [84]:
nb.fit(training_X, training_Y)
y_pred_class = nb.predict(validation_X)
print(metrics.accuracy_score(validation_Y, y_pred_class))
#y_pred_class
pd.DataFrame(y_pred_class, columns=['predictions']).to_csv('nb.csv')

0.5416527935006992


### Random forest

In [85]:
# Random Forest# Random 
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
RF_score = cross_val_score(rf, training_X, training_Y, cv=10, scoring='accuracy').mean()
RF_score

0.5889151767918024

In [86]:
rf.fit(training_X, training_Y)
y_pred_class = rf.predict(validation_X)
print(metrics.accuracy_score(validation_Y, y_pred_class))
#y_pred_class
pd.DataFrame(y_pred_class, columns=['predictions']).to_csv('rf.csv')

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

### Logistic regression with cross validation

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
LR_score = cross_val_score(lr, training_X, training_Y, cv=10, scoring='accuracy').mean()
LR_score

In [None]:
from sklearn import metrics
lr.fit(training_X, training_Y)
y_pred_class = lr.predict(validation_X)
print(metrics.accuracy_score(validation_Y, y_pred_class))
#y_pred_class
pd.DataFrame(y_pred_class, columns=['predictions']).to_csv('lr.csv')

### Neural networks

In [77]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
nn.fit(training_X, training_Y) 
y_pred_class = nn.predict(validation_X)
print(metrics.accuracy_score(validation_Y, y_pred_class))
pd.DataFrame(y_pred_class, columns=['predictions']).to_csv('nn.csv')

0.5470466804288473
