# Bring Your Own Model
<b> Using EMR Dataset for Diabetes Prediction </b>

### Load & Prepare Data

In [None]:
import json

import numpy as np
import pandas as pd

In [None]:
# Read the five EMR tables from the local DATA directory.
# The reads are independent of one another, so their order does not matter.
medications = pd.read_csv('./DATA/medications.csv')
transcripts = pd.read_csv('./DATA/transcripts.csv')
specialities = pd.read_csv('./DATA/specialities.csv')
diagnosis = pd.read_csv('./DATA/diagnosis.csv')
patients = pd.read_csv('./DATA/patients.csv')

In [None]:
patients.head(2)

In [None]:
diagnosis.head(2)

In [None]:
# Prefix every medication column name with 'med_'.
medications.columns = ['med_' + name for name in medications.columns]
medications.head(2)

In [None]:
transcripts.head(2)

In [None]:
specialities.head(2)

In [None]:
# Put the diagnosis, transcript and patient tables side by side (column-wise).
# NOTE(review): a column-wise concat aligns rows by index label, not by a patient
# key -- confirm the three files list patients in the same row order.
data = pd.concat([diagnosis, transcripts, patients], axis='columns')
# Handle missing values: replace them with 0, modifying `data` in place.
data.fillna(value=0, inplace=True)

In [None]:
data.head(3)

#### Feature Selection

In [None]:
# Categorical columns: the two demographic fields plus every column whose
# name contains 'Icd'.
categorical = ['Gender', 'State']
categorical.extend(col for col in data.columns if 'Icd' in col)

In [None]:
categorical

In [None]:
# Every column that is not categorical is treated as numerical.
# The names are collected straight from the column index; selecting a DataFrame
# first would copy the whole table just to read its labels (and would repeat any
# duplicated label once per matching column).
numerical = [col for col in data.columns if col not in categorical]

In [None]:
numerical

In [None]:
# Columns to leave out of the feature set (subtracted from the numerical columns
# when the feature list is built). The criterion used to pick them is not shown
# in this notebook.
not_important = ['DiagnosisFreq', 'AcuteCount', 'Height_Max', 'Temperature_Max', 'Weight_Min', 'BMI_Min', 
                 'RespiratoryRate_Min', 'Temperature_Std', 'Height_Mean', 'Height_Change', 'RespiratoryRate_Change', 
                 'Temperature_Change']

In [None]:
# Keep the numerical columns that are not in the exclusion list.
# A plain set difference iterates in arbitrary (hash-seed dependent) order, which
# made the resulting column order differ from one kernel session to the next.
# dict.fromkeys de-duplicates while preserving the original column order.
excluded = set(not_important)
features = [col for col in dict.fromkeys(numerical) if col not in excluded]

In [None]:
# Select the feature columns and discard the patient identifier.
# drop() returns a new frame here instead of mutating the selection in place,
# which avoids pandas' SettingWithCopyWarning on a derived frame.
df = data[features].drop('PatientGuid', axis=1)
df.head()

#### Split Data

In [None]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for testing and write both splits to disk.
# train_test_split shuffles by default, so a separate shuffle beforehand is not
# needed; the fixed seed makes the split repeatable across runs.
train, test = train_test_split(df, test_size=0.2, random_state=42)
train.to_csv('./DATA/train.csv', index=False)
test.to_csv('./DATA/test.csv', index=False)

In [None]:
train.shape

In [None]:
test.shape

#### Push Data to S3 

In [None]:
from datetime import datetime
import boto3

In [None]:
bucket = 'sagemaker-diabetes-prediction'

In [None]:
s3_session = boto3.Session().resource('s3')

In [None]:
# Create the bucket, then upload the training split under the train/ prefix.
s3_session.create_bucket(Bucket=bucket)
train_object = s3_session.Object(bucket, 'train/train.csv')
train_object.upload_file('./DATA/train.csv')

### Prepare your Model Script 

In [None]:
%%file diabetes-prediction.py

import warnings; warnings.simplefilter('ignore')

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from numpy import mean
import pandas as pd
import numpy as np
import argparse
import pickle 
import json
import os 


def model_fn(model_dir):
    """
    Load model created by Sagemaker training.

    Reads the pickled object stored in the file named 'model' inside
    `model_dir` and returns it. The file is opened in a `with` block so the
    handle is closed even if unpickling fails.
    """
    # NOTE: pickle executes arbitrary code on load; only load artifacts you produced.
    with open(os.path.join(model_dir, 'model'), 'rb') as model_file:
        model = pickle.load(model_file)
    # Load caches/vectorizers/transformers here if needed
    return model


def input_fn(request_body, request_content_type):
    """
    Return a JSON request body unchanged; reject every other content type
    with a ValueError.
    """
    if request_content_type != 'application/json':
        raise ValueError("The model only supports application/json input")
    # Add logic to transform incoming request or payload here if needed
    return request_body


def predict_fn(input_data, model):
    """
    Decode the JSON payload, shape it as a single row and return
    `model.predict` for that row.
    """
    decoded = json.loads(input_data)
    # Whatever was sent is flattened into one (1, n) row.
    single_row = np.reshape(np.array([decoded]), (1, -1))
    return model.predict(single_row)


def output_fn(prediction, content_type):
    """
    Wrap the first predicted value in a dict under the 'prediction' key.
    `content_type` is accepted but not consulted.
    """
    # Add logic to transform output prediction or response here
    return dict(prediction=prediction[0])

if __name__ == '__main__':
    # Training entry point: parse the directories (defaulting to the SM_*
    # environment variables), fit a random forest on train.csv and pickle it.
    parser = argparse.ArgumentParser()
    parser.add_argument('--output-data-dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    args = parser.parse_args()

    # ------------------------- YOUR MODEL TRAINING LOGIC STARTS HERE -------------------------
    # Load train.csv from the directory given by args.train
    scaler = MinMaxScaler()

    train_data = pd.read_csv(os.path.join(args.train, 'train.csv'))
    columns = train_data.columns.tolist()
    # NOTE(review): the scaler is fit on every column, including the DMIndicator
    # label, and is not saved with the model; predict_fn does no scaling, so
    # callers must send features that are already scaled the same way.
    train_data = scaler.fit_transform(train_data)
    train_df = pd.DataFrame(train_data, columns=columns)

    y_train = train_df['DMIndicator']
    X_train = train_df.drop('DMIndicator', axis=1)

    # 'sqrt' is what 'auto' meant for classifiers; 'auto' is deprecated and
    # removed in newer scikit-learn releases, so spell the value out.
    model = RandomForestClassifier(n_estimators=227,
                                   max_depth=10,
                                   max_features='sqrt',
                                   class_weight='balanced')
    model.fit(X_train, y_train)
    # Save the model to the location specified by args.model_dir; the with-block
    # guarantees the file is flushed and closed before the script exits.
    with open(os.path.join(args.model_dir, 'model'), 'wb') as model_file:
        pickle.dump(model, model_file)
    # ------------------------- YOUR MODEL TRAINING LOGIC STOPS HERE -------------------------
    # ------------------------- YOUR MODEL TRAINING LOGIC STOPS HERE -------------------------

### Train Model using SageMaker Training 

In [None]:
from sagemaker.sklearn.estimator import SKLearn
from sagemaker import get_execution_role

In [None]:
role = get_execution_role()

#### Create SageMaker SKLearn Estimator

In [None]:
# Configure the SKLearn estimator: it runs diabetes-prediction.py as the entry
# point on a single ml.m4.xlarge instance under the notebook's execution role.
# NOTE(review): `train_instance_type` / `train_instance_count` look like the
# SageMaker Python SDK v1 parameter names -- confirm against the installed SDK.
estimator = SKLearn(entry_point='diabetes-prediction.py',
                    train_instance_type='ml.m4.xlarge',
                    train_instance_count=1, # Scikit-Learn does not support distributed training
                    role=role)

In [None]:
estimator.__dict__

In [None]:
# Train the model by passing the path to the S3 bucket containing the training data
estimator.fit({'train': 's3://{}/train'.format(bucket)})

In [None]:
# Recover the training job name from the estimator's hyperparameters.
# NOTE(review): the [1:-1] slice drops the first and last character -- presumably
# the quotes of a JSON-encoded string value; confirm against the SDK in use.
sagemaker_job_name = estimator.hyperparameters()['sagemaker_job_name'][1:-1]
sagemaker_job_name

### Test Trained Model Locally

#### Download Trained Model from S3 to Local

In [None]:
import tarfile
import pickle
import boto3


# Locate the model artifact written by the training job.
# The bucket is taken from the estimator's SageMaker session instead of being
# hard-coded with one account's ID and region, so the cell works in any account.
# NOTE(review): this assumes the estimator used the session's default bucket
# (no explicit output_path was passed when it was created) -- confirm.
bucket = estimator.sagemaker_session.default_bucket()
key = '{}/output/model.tar.gz'.format(sagemaker_job_name)
local_model_tar = 'model.tar.gz'
local_model_path = './DATA/{}'.format(local_model_tar)

# Download the archive through the S3 resource API
s3_client = boto3.resource('s3')
s3_client.Object(bucket, key).download_file(local_model_path)

# Uncompress: pull only the 'model' member out of the archive
with tarfile.open(local_model_path, 'r:gz') as tar:
    tar.extract('model', './DATA/')

# Load local model (the with-block closes the file once it is unpickled)
with open('./DATA/model', 'rb') as model_file:
    trained_model = pickle.load(model_file)

In [None]:
def model_fn(model_dir):
    """
    Load model created by Sagemaker training.

    In this local variant `model_dir` is the path of the pickled model file
    itself. The file is opened in a `with` block so the handle is closed even
    if unpickling fails.
    """
    # NOTE: pickle executes arbitrary code on load; only load artifacts you produced.
    with open(model_dir, 'rb') as model_file:
        model = pickle.load(model_file)
    # Load caches/vectorizers/transformers here if needed
    return model


def input_fn(request_body, request_content_type):
    """
    Hand back the request body untouched when it is JSON; raise ValueError
    for any other content type.
    """
    is_json = request_content_type == 'application/json'
    if not is_json:
        raise ValueError("The model only supports application/json input")
    # Add logic to transform incoming request or payload here if needed
    return request_body


def predict_fn(input_data, model):
    """
    Parse the JSON string, arrange the values as one row and return the
    result of `model.predict` on it.
    """
    values = json.loads(input_data)
    # Everything in the payload ends up in a single (1, n) row.
    row = np.array([values]).reshape((1, -1))
    return model.predict(row)


def output_fn(prediction, content_type):
    """
    Return a dict holding the first prediction under the 'prediction' key.
    `content_type` is accepted but not used.
    """
    # Add logic to transform output prediction or response here
    first_prediction = prediction[0]
    return {'prediction': first_prediction}

#### Test Contracts using Loaded Local Model

In [None]:
model_fn('./DATA/model')

X = [0.11764705882352944,
     0.28103467751044836,
     0.3828168310705288,
     0.4500000000000002,
     0.21339252096527772,
     0.14285714285714285,
     0.37731203127272733,
     0.3200722754898495,
     0.18604651162790709,
     0.123577893392233,
     0.146005360532692,
     0.25,
     0.29641195141313426,
     0.6845679012345679,
     0.8893129770992365,
     0.37457669566732366,
     0.566510762819048,
     0.8451278752421079,
     0.1042988507084566,
     0.29536806395243054,
     0.8055555555555556,
     0.5811981560536253,
     0.4551585868073491,
     0.6600139428866417,
     0.40005396350833333,
     0.05263157894736842,
     0.40478776897287494,
     0.6056338028169015,
     0.517445664668329
    ]

# Run the sample row through the three handlers in order -- input_fn, then
# predict_fn against the locally loaded model, then output_fn -- printing the
# result of each stage.
request_body = json.dumps(X)
content_type = 'application/json'
input_fn_out = input_fn(request_body=request_body, request_content_type=content_type)
print(input_fn_out)
predict_fn_out = predict_fn(input_fn_out, trained_model)
print(predict_fn_out)
output_fn_out = output_fn(prediction=predict_fn_out, content_type='application/json')
print(output_fn_out)

### Deploy Trained Model using SageMaker Endpoints

In [None]:
predictor = estimator.deploy(instance_type='ml.m4.xlarge', 
                             initial_instance_count=1)

In [None]:
predictor.endpoint

### Evaluate Deployed Model

In [None]:
# Create Sagemaker run-time client using boto3
client = boto3.client('sagemaker-runtime')

endpoint_name = predictor.endpoint

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale the test split with the min/max learned from train.csv, the same file
# the training script fits its scaler on. Fitting a fresh scaler on the test
# rows would map the same raw value to a different number than the model saw
# during training.
scaler = MinMaxScaler()
scaler.fit(pd.read_csv('./DATA/train.csv'))

test_data = pd.read_csv('./DATA/test.csv')
columns = test_data.columns.tolist()
test_data = scaler.transform(test_data)
test_df = pd.DataFrame(test_data, columns=columns)

# Separate the label column from the feature columns
y_test_df = test_df['DMIndicator']
X_test_df = test_df.drop('DMIndicator', axis=1)

In [None]:
X_test = X_test_df.values.tolist()
y_test = y_test_df.values.tolist()

In [None]:
X_test[0]

In [None]:
y_test[:20]

In [None]:
# Send each test row to the endpoint as a JSON list and collect the value
# returned under the 'prediction' key.
# The loop variable gets its own name so the notebook-level sample `X` is not
# overwritten, and the unused enumerate index is gone.
y_predicted = []
for features_row in X_test:
    response = client.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=json.dumps(features_row),
        ContentType='application/json',
    )
    payload = response['Body'].read().decode('utf-8')
    y_predicted.append(json.loads(payload)['prediction'])

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score


# Print the confusion matrix, then the per-class report, for the endpoint's
# predictions against the held-out labels.
print("================== Confusion Matrix ==================")
matrix = confusion_matrix(y_test, y_predicted)
print(matrix)
print('\n')
print("================== Classification Report ==================")
report = classification_report(y_test, y_predicted)
print(report)

In [None]:
# Uncomment and run to terminate the endpoint after you are finished
# predictor.delete_endpoint()