In [1]:
import numpy as np
import pandas as pd


In [2]:
df_train = pd.read_csv("10k_diabetes/diab_train.csv",
                       na_values = ["?", "Not Available", "Not Mapped"])
df_test = pd.read_csv("10k_diabetes/diab_test.csv")
df_validate = pd.read_csv("10k_diabetes/diab_validation.csv")

In [3]:
print(df_train.shape)
print(df_train.dtypes)

(6000, 52)
Unnamed: 0                   int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id           object
discharge_disposition_id    object
admission_source_id         object
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride                 object
acetohexa

In [4]:
type_txt = ["discharge_disposition_id",
           "medical_specialty",
           "diag_1_desc",
           "diag_2_desc",
           "diag_3_desc"]

type_cat = ["race",
          "gender",
          "age",
          "weight",
          "admission_type_id",
        "admission_source_id",
        "payer_code",
          "max_glu_serum",
           "A1Cresult",
           "metformin",
           "repaglinide",
           "nateglinide",
           "chlorpropamide",
           "glimepiride",
           "acetohexamide",
           "glipizide",
           "glyburide",
           "tolbutamide",
           "pioglitazone",
           "rosiglitazone",
           "acarbose",
           "miglitol",
           "troglitazone",
           "tolazamide",
           "examide",
           "citoglipton",
           "insulin",
           "glyburide.metformin",
           "glipizide.metformin",
           "glimepiride.pioglitazone",
           "metformin.rosiglitazone",
           "metformin.pioglitazone",
           "change",
           "diabetesMed"]

type_le = ["age", "weight", "A1Cresult"]

type_int = ["time_in_hospital",
           "num_lab_procedures",
           "num_procedures",
           "num_medications",
           "number_outpatient",
           "number_emergency",
           "number_inpatient",
           "number_diagnoses"]

type_float = ["diag_1",
             "diag_2",
             "diag_3"]

In [5]:
#l = list()
#for columnName, _ in df_train.iteritems():
#    if(df_train[columnName].dtypes == np.object):
#        l.append(df_train[columnName].value_counts(dropna = False))

In [6]:
from sklearn.impute import SimpleImputer

def prep_data(df, impute=True, imp = "mean", nlp=False):
    y = df["readmitted"]
    df = df.drop(columns=['readmitted', 'Unnamed: 0'])
    
    if not nlp:
        df = df.drop(columns = type_txt)
    #Cast types
    #for i in type_cat:
    #    df[i] = df[i].astype('category')

    for i in type_int:
        #df_train[i] = df_train[i].astype('int32')
        df[i] = pd.to_numeric(df[i], errors='coerce', downcast='integer')

    for i in type_float:
        df[i] = pd.to_numeric(df[i], errors='coerce', downcast='float')
    
    #Get features that are categorical and create oh encoding
    ohe_mask = df.dtypes==object
    df_ohe = pd.get_dummies(df, prefix=df.columns[ohe_mask].tolist(),
               columns=df.columns[ohe_mask].tolist())
    
    #Impute missing values
    if impute:
        idx = pd.isnull(df_ohe).any().tolist()
        print("Impute values for the following attributes")
        print(df_ohe.columns[idx])

        df_imp = SimpleImputer(strategy=imp).fit_transform(df_ohe.loc[:,idx])
        df_ohe.loc[:,idx] = df_imp
        
    return df_ohe, y

In [7]:
X_train, y_train = prep_data(df_train)
X_test, y_test = prep_data(df_train)
X_val, y_val = prep_data(df_validate)

Impute values for the following attributes
Index(['diag_1', 'diag_2', 'diag_3'], dtype='object')
Impute values for the following attributes
Index(['diag_1', 'diag_2', 'diag_3'], dtype='object')
Impute values for the following attributes
Index(['diag_1', 'diag_2', 'diag_3'], dtype='object')


In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
calibrated_forest = CalibratedClassifierCV(base_estimator=clf)
param_grid = {'base_estimator__n_estimators': [10, 50, 100, 200], 
              'base_estimator__max_depth': [2, 6, 12, 24]}
search = GridSearchCV(calibrated_forest, param_grid, cv=5)

In [None]:
search.fit(X_train.values, y_train)
pred_test = clf.predict(X_test)



In [10]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

acc = accuracy_score(y_test, pred_test)
conf_mat = confusion_matrix(y_test, pred_test)

In [11]:
acc

0.9805

In [12]:
conf_mat

array([[3615,    6],
       [ 111, 2268]])