## Diabetes Prediction on Electronic Medical Records (EMR) Data
### Data Preparation and Exploratory Data Analysis

##### Imports

In [None]:
%matplotlib inline
import random
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
import seaborn as sns
from matplotlib import cm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

**Note:** The SageMaker SDK is not used in this notebook.

### Load & Prep Data

In [None]:
# Read the raw EMR records from the local DATA folder into a DataFrame.
df = pd.read_csv('./DATA/emr.csv')

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded data.
df.shape

### Exploratory Data Analysis

In [None]:
# Show how many records fall into each value of the target column.
# The column is passed through explicit `x=` / `data=` keywords: newer seaborn
# releases accept only `data` positionally, so a bare Series passed as the
# first argument is no longer interpreted as the x variable.
sns.countplot(x='class', data=df, label='Count')
plt.show()

<b>Box Plot and Histograms</b><br>
Visualize the distribution of the input variables.

In [None]:
# One box plot per column, each on its own axes in a 2x2 grid with
# independent x and y scales.
# NOTE(review): a 2x2 layout has room for four panels — confirm df has no
# more than four plottable columns.
df.plot.box(
    subplots=True,
    layout=(2, 2),
    sharex=False,
    sharey=False,
    figsize=(9, 9),
    title='Input Variables',
)
plt.show()

In [None]:
# Histograms of the DataFrame's columns, 30 bins each, on a 9x9 inch figure.
df.hist(bins=30, figsize=(9,9))
# Title the figure through pyplot (already used for show()) rather than the
# discouraged pylab namespace; both resolve to the same suptitle function.
plt.suptitle("Input Variables")
plt.show()

Descriptive Statistics

In [None]:
# Summary statistics for the loaded data.
df.describe()

### Scale, Split & Create Train/Test Datasets

In [None]:
# Feature matrix: the four input columns used by the rest of the notebook.
X = df.loc[
    :,
    ['bmi', 'diastolic_bp_change', 'systolic_bp_change', 'respiratory_rate'],
]
# Target vector: the 'class' column.
y = df.loc[:, 'class']

#### Encode the classes into numerical values using scikit-learn's LabelEncoder

In [None]:
# Map the string class labels to integers. The encoder is fitted on the two
# expected labels only, so any other value in the column raises on transform.
label_encoder = LabelEncoder()
label_encoder.fit(['Yes', 'No'])
# Encode straight from the source column instead of from `y`: the result is the
# same on a first run, and re-running this cell no longer fails by trying to
# encode integers that were already encoded.
y = label_encoder.transform(df['class'])

In [None]:
# Display the encoded target values.
y

#### Split X, y into Train and Test Sets

In [None]:
# Split features and labels into train and test partitions. No test_size is
# given, so scikit-learn's default split ratio applies; random_state=123 pins
# the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

In [None]:
# Size of the training feature set.
X_train.shape

In [None]:
# Size of the test feature set.
X_test.shape

#### Scale Feature Columns

In [None]:
# Learn the scaling parameters from the training features only, then apply
# that same fitted scaler to both partitions so the test set is never used
# for fitting.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#### Combine Scaled X & y into Train and Test DataFrames 

In [None]:
# Wrap the training features and labels in DataFrames with explicit column names.
X_train = pd.DataFrame(
    X_train,
    columns=['bmi', 'diastolic_bp_change', 'systolic_bp_change', 'respiratory_rate'],
)
y_train = pd.DataFrame(y_train, columns=['class'])
# Place the label first, followed by the feature columns.
train_df = pd.concat([y_train, X_train], axis='columns')
train_df.head()

In [None]:
# Wrap the test features and labels in DataFrames with explicit column names.
X_test = pd.DataFrame(
    X_test,
    columns=['bmi', 'diastolic_bp_change', 'systolic_bp_change', 'respiratory_rate'],
)
y_test = pd.DataFrame(y_test, columns=['class'])
# Place the label first, followed by the feature columns.
test_df = pd.concat([y_test, X_test], axis='columns')
test_df.head()

#### Create a DataFrame for Batch Inference without the Class column

In [None]:
# Features-only copy of the test set: the 'class' label column is removed.
batch_test_df = test_df.drop(columns=['class'])
batch_test_df.head()

#### Write Train & Test Sets to Local Directory

In [None]:
# Write the three datasets as CSV files without a header row or index column.
# DataFrame.to_csv does not create missing parent directories, so each target
# folder is created first (a no-op when it already exists).
for frame, csv_path in (
    (train_df, './DATA/train/train.csv'),
    (test_df, './DATA/test/test.csv'),
    (batch_test_df, './DATA/batch_test/batch_test.csv'),
):
    Path(csv_path).parent.mkdir(parents=True, exist_ok=True)
    frame.to_csv(csv_path, header=False, index=False)