<h1>XGBoost Cloud Prediction Invocation Template</h1>
<h4>Invoke SageMaker Prediction Service</h4>

In [None]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os

import boto3
import re # python regex module
from sagemaker import get_execution_role
import sagemaker

# SDK 2 serializers and deserializers
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

In [None]:
# SageMaker Python SDK v2: RealTimePredictor was renamed to Predictor.
# Migration notes: https://sagemaker.readthedocs.io/en/stable/v2.html

# Attach a Predictor to an endpoint that already exists (nothing is deployed here).
endpoint_name = 'xgboost-bikerental-v1'
predictor = sagemaker.predictor.Predictor(endpoint_name=endpoint_name)

In [None]:
# Send request payloads to the endpoint in CSV format.
predictor.serializer = CSVSerializer()

In [None]:
# Load the records to score; the path is relative to the notebook's working directory.
df_all = pd.read_csv('bike_test.csv')

In [None]:
# Preview the first rows of the loaded data.
df_all.head()

In [None]:
# Column labels from the second column onward (the first column is skipped).
df_all.columns[1:]

In [None]:
# predict() needs array-like input: either a numpy array or a nested list
# of values such as [[19,1],[20,1]].
# Keep every column except the first and extract the values as a numpy array.
feature_columns = df_all.columns[1:]
arr_test = df_all[feature_columns].values

In [None]:
# Check the container type of the feature data.
type(arr_test)

In [None]:
# Dimensions of the feature array.
arr_test.shape

In [None]:
# Peek at the first five rows of the feature array.
arr_test[:5]

In [None]:
# Smoke test: send only the first two rows to the endpoint.
sample_rows = arr_test[:2]
result = predictor.predict(sample_rows)

In [None]:
# Inspect the raw response returned by the endpoint for the two sample rows.
result

In [None]:
# Re-check the dimensions of the full feature array before splitting it into chunks.
arr_test.shape

### Split the input data into chunks
There are thousands of rows in this data set that need inference.  
When communicating over the internet, it is a good idea to split the data into chunks to avoid payload-size and timeout errors.

In [None]:
# For a large number of predictions, split the input data into chunks and
# query the prediction service once per chunk.
# np.array_split is convenient: it takes the number of chunks wanted and
# tolerates a row count that does not divide evenly.

# xgboost 1-2-2 returns the predicted values with inconsistent delimiters
# (comma, newline or both), so split on any run of commas and/or whitespace.
# Splitting on the delimiters themselves -- rather than on every non-digit
# character -- keeps signs and exponents intact: "-0.5" and "1.5e-05" each
# stay a single token that float() parses correctly.
pattern = r'[,\s]+'

predictions = []
for arr in np.array_split(arr_test, 10):
    # array_split yields empty chunks when there are fewer rows than chunks;
    # there is nothing to score in that case.
    if len(arr) == 0:
        continue

    # The response is decoded as UTF-8 text before it is tokenized.
    response = predictor.predict(arr)
    result = re.split(pattern, response.decode("utf-8"))

    print(arr.shape)
    # Leading/trailing delimiters produce empty tokens; skip them.
    chunk_predictions = [float(r) for r in result if r != ""]  # Thanks, Ionut Barbu!

    # One prediction is expected per input row. Fail here with a clear message
    # rather than later, when the predictions are attached to the DataFrame.
    if len(chunk_predictions) != len(arr):
        raise ValueError(
            "Expected {} predictions for this chunk but parsed {}".format(
                len(arr), len(chunk_predictions)
            )
        )

    predictions += chunk_predictions

In [None]:
# Number of predictions collected across all chunks; expected to equal the
# number of rows in arr_test.
len(predictions)

In [None]:
# expm1(x) = exp(x) - 1, the inverse of log1p.
# NOTE(review): presumably the model predicts log1p(count) -- confirm against
# the training notebook.
np.expm1(predictions)

In [None]:
# Store the expm1-transformed predictions in the 'count' column of df_all.
# The number of predictions must match the number of rows in df_all.
df_all['count'] = np.expm1(predictions)

In [None]:
# Preview df_all, which now includes the 'count' column.
df_all.head()

In [None]:
# Write only the timestamp and the predicted count to CSV, without the index.
output_columns = ['datetime', 'count']
df_all[output_columns].to_csv('predicted_count_cloud.csv', index=False)

In [None]:
# Delete Endpoint to prevent unnecessary charges
# NOTE: cells that call predictor.predict() will fail after this runs, until
# the endpoint is deployed again.
predictor.delete_endpoint()