In [1]:
# import modules

import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# show every column when a frame is displayed (None lifts the column cap)
pd.options.display.max_columns = None

In [2]:
# read source data: the main table plus three lookup tables

df = pd.read_csv(os.path.realpath('../data/diabetic_data.csv'))

# every lookup file has a generic 'description' column; give each one a
# table-specific name so the three descriptions stay distinguishable
admission_source_df = (
    pd.read_csv(os.path.realpath('../data/admission_source_id.csv'))
    .rename(columns={'description': 'admission_source_description'})
)

admission_type_df = (
    pd.read_csv(os.path.realpath('../data/admission_type_id.csv'))
    .rename(columns={'description': 'admission_type_description'})
)

discharge_disposition_df = (
    pd.read_csv(os.path.realpath('../data/discharge_disposition_id.csv'))
    .rename(columns={'description': 'discharge_disposition_description'})
)

In [3]:
# remove the ID columns. note that some patients have several encounters;
# it is not clear how to model that, so each encounter is treated as an
# independent observation

df = df.drop(columns=['encounter_id', 'patient_nbr'])

In [4]:
# create merged data frame: attach the description belonging to each id column
#
# how='left' keeps every row of df even when its id has no row in the lookup
# table; the default inner join would silently drop such rows. An unmatched
# id gets NaN as its description.
# validate='many_to_one' raises pandas.errors.MergeError if a lookup table
# lists the same id more than once, which would otherwise silently duplicate
# rows of df.

for lookup_df, id_column in [
    (admission_source_df, 'admission_source_id'),
    (admission_type_df, 'admission_type_id'),
    (discharge_disposition_df, 'discharge_disposition_id'),
]:
    df = pd.merge(df, lookup_df, on=id_column, how='left', validate='many_to_one')

In [5]:
# the id columns are no longer needed now that the descriptions are in the frame

df = df.drop(columns=['admission_source_id', 'admission_type_id', 'discharge_disposition_id'])

In [6]:
# blank out missing values: '?' entries and NaNs both become empty strings

df = df.replace('?', '').fillna('')

In [7]:
# encode the non-numerical columns as integer category codes
# note that a lot of these should really be ordered, but that can be done later

category_columns = [
    "race", "gender", "age", "weight", "payer_code", "medical_specialty",
    # these are medical diagnosis codes, see:
    # https://www.cms.gov/regulations-and-guidance/guidance/transmittals/downloads/r1996cp.pdf
    "diag_1", "diag_2", "diag_3",
    "max_glu_serum", "A1Cresult",
    "metformin", "repaglinide", "nateglinide", "chlorpropamide",
    "glimepiride", "acetohexamide", "glipizide", "glyburide",
    "tolbutamide", "pioglitazone", "rosiglitazone", "acarbose",
    "miglitol", "troglitazone", "tolazamide", "examide",
    "citoglipton", "insulin",
    "glyburide-metformin", "glipizide-metformin",
    "glimepiride-pioglitazone", "metformin-rosiglitazone",
    "metformin-pioglitazone",
    "change", "diabetesMed", "readmitted",
    "admission_source_description",
    "admission_type_description",
    "discharge_disposition_description",
]

for name in category_columns:
    # convert to a categorical, then keep only the integer code of each value
    df[name] = df[name].astype('category').cat.codes

In [8]:
# drop every column in which a single value accounts for almost all of the rows
# this gets rid of a lot of columns; it might discard some key information, but
# those columns can be put back later

rows = len(df.index)
threshold = 0.99995 * rows

# value_counts() sorts by descending count, so its first entry is the count
# of the most frequent value in the column
near_constant = [
    name for name in df.columns.values
    if df[name].value_counts().iloc[0] >= threshold
]
df.drop(columns=near_constant, inplace=True)

print("number of columns: {0}".format(len(df.columns)))

number of columns: 41


In [9]:
# possibly do a dimensional reduction here if we can work out how
# http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
#
# 'readmitted' is kept out of the fit: it is the outcome column, and fitting
# the components on it would leak the outcome into the reduced features.
# NOTE(review): assumes 'readmitted' is the prediction target - confirm.
# NOTE(review): the fit still uses every row, including rows that later end up
# in the test split; consider fitting on the training rows only.

feature_columns = [name for name in df.columns if name != 'readmitted']

# n_components="mle" lets PCA choose the number of components itself;
# PCA.fit returns the estimator, so `fit` refers to the same object as `pca`
pca = PCA(n_components = "mle", svd_solver = "full")
fit = pca.fit(df[feature_columns])

In [10]:
# split the rows into training and test data frames
# test_size=0.25 and shuffle=True are what train_test_split uses when they are
# not given; they are spelled out here so the split is visible at a glance

train, test = train_test_split(df, test_size=0.25, shuffle=True, random_state=3331)

In [11]:
# start to build the TF model