In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 

%matplotlib inline

# Read Data

In [2]:
# Load the raw dataset into a DataFrame
diabetes = pd.read_csv("diabetic_data.csv")

diabetes.head()

# All medication columns in the raw data - to be dropped from `diabetes`
# once they have been dummified
medications = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
    'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
    'miglitol', 'troglitazone', 'tolazamide', 'examide',
    'citoglipton', 'insulin', 'glyburide-metformin',
    'glipizide-metformin', 'glimepiride-pioglitazone',
    'metformin-rosiglitazone', 'metformin-pioglitazone',
]

# Isolate the medications and the outcome column for EDA.
# 'examide' and 'citoglipton' are left out of this subset; every other
# medication keeps its position from `medications`.
med_eda = diabetes[
    [med for med in medications if med not in {'examide', 'citoglipton'}]
    + ['readmitted']
]



# Make Output Binary

In [3]:
# Binary classification of the readmission attribute:
# '<30' becomes 'readmitted'; 'NO' and '>30' are both collapsed into 'otherwise'.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the attribute-accessed Series: that chained
# in-place form is deprecated and leaves the frame unchanged under pandas
# Copy-on-Write.
diabetes['readmitted'] = diabetes['readmitted'].replace(
    {'NO': 'otherwise', '>30': 'otherwise', '<30': 'readmitted'}
)

# Dummification

#### Easy Dummification 
- gender (easy: just drop NAs + make 0 or 1)  
- change (easy: 0 or 1) 
- diabetesMed (easy: 0 or 1) 

#### Columns to Dummify
- race (consolidate sparse categories into 3 larger buckets) 
- age (dummify buckets) 
- admission_type_id (replace numbers with corresponding qualitative value and bucket/dummify) 
- discharge_disposition_id (replace numbers with corresponding qualitative value and bucket/ dummify) 
- admission_source_id (replace numbers with corresponding qualitative value and dummify) 
- medical_specialty (replace numbers with corresponding qualitative value and dummify) 
- diag_1 (bucket / dummify)
- diag_2 (bucket / dummify)
- diag_3 (bucket / dummify)
- max_glu_serum (bucket / dummify)
- A1Cresult (bucket / dummify)
- medications





### Easy Dummification

In [4]:
# Each column is re-assigned from a non-inplace replace(): calling
# replace(..., inplace=True) on `diabetes.<col>` is chained in-place mutation,
# which is deprecated and does not update the frame under Copy-on-Write.

# diabetesMed - encode 'Yes' / 'No' as 1 / 0
diabetes['diabetesMed'] = diabetes['diabetesMed'].replace({'Yes': 1, 'No': 0})

# change - encode 'Ch' / 'No' as 1 / 0
diabetes['change'] = diabetes['change'].replace({'Ch': 1, 'No': 0})

# gender - drop the 'Unknown/Invalid' rows, then encode 'Male' / 'Female' as 1 / 0.
# .copy() makes the filtered frame independent so that later column
# assignments do not raise SettingWithCopyWarning.
diabetes = diabetes[diabetes['gender'] != 'Unknown/Invalid'].copy()
diabetes['gender'] = diabetes['gender'].replace({'Male': 1, 'Female': 0})

### Race Dummification

In [5]:
# Keep 'Caucasian' and 'AfricanAmerican' as-is; pool every other value into 'Other'
kept_races = ['Caucasian', 'AfricanAmerican']
diabetes.race = diabetes.race.where(diabetes.race.isin(kept_races), 'Other')

In [6]:
# Row counts per consolidated race bucket
diabetes['race'].value_counts()
# One-hot encode race; drop_first omits the first category's indicator column
race_dummy = pd.get_dummies(
    diabetes['race'],
    prefix="race",
    drop_first=True,
)


### Age Dummification - New Buckets = <30, [30-60), [60-100)

In [7]:
# Collapse the raw 10-year age brackets into three buckets.
_age_buckets = {
    '[30-40)': '[30, 60)', '[40-50)': '[30, 60)', '[50-60)': '[30, 60)',
    '[60-70)': '[60, 100)', '[70-80)': '[60, 100)',
    '[80-90)': '[60, 100)', '[90-100)': '[60, 100)',
}
_age_labels = {'<30', '[30, 60)', '[60, 100)'}

# Values that are already one of the bucket labels pass through unchanged, so
# re-running this cell does not send every row to '<30'. Any other value that
# is not a listed bracket falls into '<30', as before.
diabetes['age'] = [
    x if x in _age_labels else _age_buckets.get(x, '<30')
    for x in diabetes['age']
]


In [8]:
# Row counts per age bucket
diabetes['age'].value_counts()
# One-hot encode the age buckets; drop_first omits the first bucket's indicator column
age_dummy = pd.get_dummies(
    diabetes['age'],
    prefix='age',
    drop_first=True,
)

### Admissions Type Dummification - Drop 

In [9]:
# admission_type_id is dropped rather than dummified.
# Rebinding (instead of inplace=True) avoids mutating a filtered frame in
# place, and errors='ignore' lets the cell be re-run after the column is
# already gone instead of raising KeyError.
diabetes = diabetes.drop(columns=['admission_type_id'], errors='ignore')

### Admissions Source Dummification

In [11]:
# Admission-source IDs grouped as emergency and referral patients
emergency = [7]
referral = [1, 2, 3]
_admission_source_labels = {'Referral', 'Emergency', 'Other'}

# Map each ID to its group; anything not listed becomes 'Other'.
# Values that are already a group label pass through unchanged, so re-running
# this cell does not turn every row into 'Other'.
diabetes['admission_source_id'] = [
    x if x in _admission_source_labels else
    'Referral' if x in referral else
    'Emergency' if x in emergency else
    'Other'
    for x in diabetes['admission_source_id']
]


Emergency    57492
Referral     30855
Other        13416
Name: admission_source_id, dtype: int64

In [17]:
# Row counts per admission-source group
diabetes['admission_source_id'].value_counts()
# One-hot encode the admission source; drop_first omits the first group's indicator column
admissions_dummy = pd.get_dummies(
    diabetes['admission_source_id'],
    prefix='Admissions',
    drop_first=True,
)

### Discharge Disposition - Classify into Home and Other, and drop Expired (dead)

In [15]:
# Discharge-disposition IDs for patients discharged to home and patients who
# died, using IDs from the IDs file
expired = [11, 19, 20, 21]
home = [1, 6, 8, 13]
_discharge_labels = {'Home', 'Expired', 'Other'}

# Map each ID to its group; anything not listed becomes 'Other'.
# Values that are already a group label pass through unchanged, so re-running
# this cell does not turn every row into 'Other'.
diabetes['discharge_disposition_id'] = [
    x if x in _discharge_labels else
    'Home' if x in home else
    'Expired' if x in expired else
    'Other'
    for x in diabetes['discharge_disposition_id']
]

# Drop expired patients. .copy() makes the filtered frame independent so that
# later column assignments do not raise SettingWithCopyWarning.
diabetes = diabetes[diabetes['discharge_disposition_id'] != 'Expired'].copy()

In [16]:
# Row counts per discharge group
diabetes['discharge_disposition_id'].value_counts()
# One-hot encode the discharge group; drop_first omits the first group's indicator column
discharge_dummy = pd.get_dummies(
    diabetes['discharge_disposition_id'],
    prefix='discharge',
    drop_first=True,
)

### Medical Specialty Dummification - Cardiology, General Practice, Internal, Missing, Other, Surgery

In [18]:
# Row counts per medical specialty
diabetes['medical_specialty'].value_counts()
# Empty placeholders for the specialty groups (each name gets its own list)
cardiology, generalpractice, internalmedicine, missing, surgery = (
    [] for _ in range(5)
)

?                                49127
InternalMedicine                 14328
Emergency/Trauma                  7449
Family/GeneralPractice            7302
Cardiology                        5295
                                 ...  
Neurophysiology                      1
Psychiatry-Addictive                 1
Pediatrics-InfectiousDiseases        1
Speech                               1
SportsMedicine                       1
Name: medical_specialty, Length: 73, dtype: int64

# Identify Missing Values / Drop Data 

In [None]:
# Columns listed for the missing-value check
missingcols = [
    'race',
    'weight',
    'payer_code',
    'medical_specialty',
    'diag_1',
    'diag_2',
    'diag_3',
]

In [None]:
# diabetes.replace('?', np.nan, inplace = True)
# diabetes.isnull().sum(axis =0)

# Dropping Data 
- weight (97% missing)
- payer_code - not relevant to outcome
- examide, citoglipton - no variance
- Before Train-Test-Split - will need to drop encounter and patient nbr 

In [None]:
# Drop columns: weight, payer_code and every medication column.
# A single non-inplace drop avoids mutating a filtered frame in place, and
# errors='ignore' lets the cell be re-run after the columns are already gone
# instead of raising KeyError.
diabetes = diabetes.drop(
    columns=['weight', 'payer_code', *medications],
    errors='ignore',
)

# Drop rows with an unknown gender
diabetes = diabetes[diabetes['gender'] != 'Unknown/Invalid'].copy()

In [None]:
# Preview the first rows, transposed so that each column is shown as a row
diabetes.head().T

In [None]:
# Row counts per admission_source_id value
diabetes.admission_source_id.value_counts()

# Feature Engineering


In [None]:
# (rows, columns) of the medication EDA subset
med_eda.shape

In [None]:
# New list of medications: the same columns as before, without
# 'examide' and 'citoglipton'
medications = [
    'metformin',
    'repaglinide',
    'nateglinide',
    'chlorpropamide',
    'glimepiride',
    'acetohexamide',
    'glipizide',
    'glyburide',
    'tolbutamide',
    'pioglitazone',
    'rosiglitazone',
    'acarbose',
    'miglitol',
    'troglitazone',
    'tolazamide',
    'insulin',
    'glyburide-metformin',
    'glipizide-metformin',
    'glimepiride-pioglitazone',
    'metformin-rosiglitazone',
    'metformin-pioglitazone',
]

# Function - med_change

In [None]:
# Functions: 
def med_change(df, column_list):
    '''
    Derive a feature called ``medchange`` that counts, for each row, how many
    of the medication columns in ``column_list`` hold a value other than
    "No" or "Steady" - i.e. how many medications had their dosage changed,
    regardless of the direction of the change.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the medication columns. It is modified in place:
        a ``medchange`` column is added (or overwritten). No other column
        is touched.
    column_list : iterable of str
        Names of the medication columns to inspect.

    Returns
    -------
    pandas.DataFrame
        ``df.head().T`` - a transposed preview of the first rows, meant for
        display. The full result is in ``df`` itself.
    '''
    columns = list(column_list)
    # Vectorised membership test instead of writing a '<col>temp' scratch
    # column per medication: the scratch columns would overwrite, and then
    # delete, any existing column that happened to carry that name.
    unchanged = df[columns].isin(["No", "Steady"])
    # The cast keeps an integer count even when column_list is empty.
    df['medchange'] = (~unchanged).sum(axis=1).astype('int64')

    return df.head().T


# Testing Function: run med_change on a copy of med_eda, so the 'medchange'
# column is added to med_eda2 and not to med_eda
med_eda2 = med_eda.copy()
med_change(med_eda2, medications)


# Dummify Medications

In [None]:
# Dummify Medications
def _medication_dummies(column):
    '''One-hot encode one medication column of med_eda, using the column name
    as the prefix and omitting the first category's indicator column.'''
    return pd.get_dummies(med_eda[column], prefix=column, drop_first=True)


metformin_dummy = _medication_dummies('metformin')
repaglinide_dummy = _medication_dummies('repaglinide')
nateglinide_dummy = _medication_dummies('nateglinide')
chlorpropamide_dummy = _medication_dummies('chlorpropamide')
glimepiride_dummy = _medication_dummies('glimepiride')
acetohexamide_dummy = _medication_dummies('acetohexamide')
glipizide_dummy = _medication_dummies('glipizide')
glyburide_dummy = _medication_dummies('glyburide')
tolbutamide_dummy = _medication_dummies('tolbutamide')
pioglitazone_dummy = _medication_dummies('pioglitazone')
rosiglitazone_dummy = _medication_dummies('rosiglitazone')
acarbose_dummy = _medication_dummies('acarbose')
miglitol_dummy = _medication_dummies('miglitol')
troglitazone_dummy = _medication_dummies('troglitazone')
tolazamide_dummy = _medication_dummies('tolazamide')
insulin_dummy = _medication_dummies('insulin')
glyburide_metformin_dummy = _medication_dummies('glyburide-metformin')
glipizide_metformin_dummy = _medication_dummies('glipizide-metformin')
glimepiride_pioglitazone_dummy = _medication_dummies('glimepiride-pioglitazone')
metformin_rosiglitazone_dummy = _medication_dummies('metformin-rosiglitazone')
metformin_pioglitazone_dummy = _medication_dummies('metformin-pioglitazone')


In [None]:
# Per-medication dummy frames, in the order they are concatenated
dummy_cols = [
    metformin_dummy,
    repaglinide_dummy,
    nateglinide_dummy,
    chlorpropamide_dummy,
    glimepiride_dummy,
    acetohexamide_dummy,
    glipizide_dummy,
    glyburide_dummy,
    tolbutamide_dummy,
    pioglitazone_dummy,
    rosiglitazone_dummy,
    acarbose_dummy,
    miglitol_dummy,
    troglitazone_dummy,
    tolazamide_dummy,
    insulin_dummy,
    glyburide_metformin_dummy,
    glipizide_metformin_dummy,
    glimepiride_pioglitazone_dummy,
    metformin_pioglitazone_dummy,
    metformin_rosiglitazone_dummy,
]

In [None]:
# Combine the per-medication dummy frames side by side
med_eda_final = pd.concat(dummy_cols, axis=1)

# Prepend the first two columns of `diabetes`
# (presumably the encounter / patient identifiers - TODO confirm).
# join='inner' keeps only the rows still present in `diabetes`. The dummies
# come from `med_eda`, which was taken before rows were filtered out of
# `diabetes`, so the default outer join on the index would bring those rows
# back with NaN in the two leading columns.
med_eda_final = pd.concat(
    [diabetes.iloc[:, :2], med_eda_final],
    axis=1,
    join='inner',
)

In [None]:
# Display the combined frame of leading columns and medication dummies
med_eda_final

# Write CSV 

In [None]:
# Write a Function
# med_eda.drop(columns = ['readmitted'], inplace = True)
# med_eda.to_csv('./med_eda.csv', index = False)
# med_eda_final.to_csv('./med_eda_dummified', index = False)

# Evaluating Model 

In [None]:
# from sklearn.metrics import classification_report
# print(classification_report(y_test, y_preds))