### Data Cleaning 

In [1]:
# Import libraries used throughout the notebook
import numpy as np
import pandas as pd
import os

# Make IPython display the value of every top-level expression in a cell,
# not only the last one. Cells below that contain several bare expressions
# (e.g. `data.columns` followed by `data.shape`) rely on this setting.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
%pwd # checks on working directory 

'/Users/pmaddo/Documents/Collaboration/Intro to Python R for Data Statistis and Data Science- WiDS'

In [3]:
# Read the diabetic.csv dataset into a DataFrame
# (comma is already read_csv's default delimiter, so it is not spelled out).
data = pd.read_csv(filepath_or_buffer="dataset/diabetic.csv")

In [4]:
# Both expressions are displayed because ast_node_interactivity is set to "all".
data.columns # column labels of the loaded DataFrame
data.shape # (number of rows, number of columns)

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

(10000, 50)

In [5]:
data.info() # Check structure: index range, per-column non-null counts and dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 50 columns):
encounter_id                10000 non-null int64
patient_nbr                 10000 non-null int64
race                        10000 non-null object
gender                      10000 non-null object
age                         10000 non-null object
weight                      10000 non-null object
admission_type_id           10000 non-null int64
discharge_disposition_id    10000 non-null int64
admission_source_id         10000 non-null int64
time_in_hospital            10000 non-null int64
payer_code                  10000 non-null object
medical_specialty           10000 non-null object
num_lab_procedures          10000 non-null int64
num_procedures              10000 non-null int64
num_medications             10000 non-null int64
number_outpatient           10000 non-null int64
number_emergency            10000 non-null int64
number_inpatient            10000 non-null int64
diag

In [6]:
data.head() # display the first 5 rows (head() default)
data.tail() # display the last 5 rows (tail() default)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
9995,42801564,25911072,Caucasian,Female,[80-90),?,5,3,17,3,...,No,Steady,No,No,No,No,No,No,Yes,>30
9996,42802962,19316196,Caucasian,Female,[80-90),?,2,18,1,4,...,No,No,No,No,No,No,No,No,No,>30
9997,42805614,19316448,Caucasian,Male,[50-60),?,1,18,7,14,...,No,No,No,No,No,No,No,No,No,NO
9998,42808980,987381,Caucasian,Female,[50-60),?,1,1,7,2,...,No,No,No,No,No,No,No,No,No,NO
9999,42809670,23724657,Caucasian,Male,[80-90),?,1,3,17,4,...,No,No,No,No,No,No,No,No,Yes,NO


The dataset contains cells with the entry "?", which denotes missing data. We therefore replace "?" with NaN so that pandas recognizes these cells as missing values.

In [7]:
# Swap the "?" placeholder for NaN so pandas' missing-value tools can see it
data = data.replace(to_replace='?', value=np.nan)

In [8]:
data.head(3)  # first three rows, now showing NaN where "?" used to be

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO


In [9]:
data.isna().sum()  # number of missing values in each column

encounter_id                    0
patient_nbr                     0
race                          184
gender                          0
age                             0
weight                       9725
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  10000
medical_specialty            3629
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                          6
diag_2                         63
diag_3                        300
number_diagnoses                0
max_glu_serum                   0
A1Cresult                       0
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide 

In [10]:
# Class distribution: number of rows for each value of the `readmitted` column
readmitted_counts = data.groupby(by='readmitted').size()
print(readmitted_counts)

readmitted
<30    1098
>30    3834
NO     5068
dtype: int64


Data is ready for EDA 

### EDA

### Model Building and Evaluation

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

### Splitting the Data

In [None]:
# Split features and target into training and validation subsets.
# random_state pins the shuffle so the split (and every metric computed from
# it) is reproducible across Restart & Run All.
# NOTE(review): X (feature matrix) and y (target) are not defined in any
# visible cell of this notebook — they must be built before this cell runs.
x_train, x_val, y_train, y_val = train_test_split(X, y, random_state=42)

### Building a Logistic Regression Model

In [None]:
# Fit a logistic regression classifier on the training split.
# Classifiers are trained with fit(); LogisticRegression has no fit_transform().
lr_model = LogisticRegression()
lr_model.fit(x_train, y_train)

# Predict labels for the held-out validation split
lr_predicted_values = lr_model.predict(x_val)

# Fraction of validation rows whose predicted label matches the true label
lr_accuracy = accuracy_score(y_val, lr_predicted_values)
lr_accuracy

### Building a Decision Tree Model

In [3]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Fit a decision tree classifier on the training split.
# Classifiers are trained with fit(); DecisionTreeClassifier has no fit_transform().
tree_model = DecisionTreeClassifier()
tree_model.fit(x_train, y_train)

# Predict labels for the held-out validation split
tree_predicted_values = tree_model.predict(x_val)

# accuracy_score is the function imported from sklearn.metrics; there is no
# `model` object in this notebook to call it on.
tree_accuracy = accuracy_score(y_val, tree_predicted_values)
tree_accuracy