# Training forecasting models on traces from Marketplace
This notebook trains forecasting models on traces from Marketplace with DeepAR on AWS SageMaker.
It follows the example [here](../sagemaker-memory-poc/sagemaker-memory-prediction-poc.ipynb)

The data comes from scraping Marketplace metrics from Datadog

This notebook will download the data needed, train a model, and test it. Just run all the cells.

In [2]:
import numpy as np
import os
import sagemaker

In [3]:
# S3 bucket and key prefix under which the training data and model artifacts live.
bucket = 'manifoldco-sagemaker'
prefix = 'hlnr-o'

# S3 URIs always use '/' as separator, so they are assembled by plain string joining.
# os.path.join would insert the operating system's separator (a backslash on Windows).
S3_PATH = 's3://' + '/'.join([bucket, prefix])

# One model per metric; each model gets its own '<metric>/marketplace-poc' location in S3.
MODEL_PATHS = {
    model_name: '/'.join([S3_PATH, model_name, 'marketplace-poc'])
    for model_name in (
        'memory-forecasting',
        'cpu-forecasting',
        'http-latency-forecasting',
        'http-request-count-forecasting',
    )
}

# ECR image of the DeepAR algorithm (us-east-1 registry, 'latest' tag).
DEEPAR_IMAGE = '522234722520.dkr.ecr.us-east-1.amazonaws.com/forecasting-deepar:latest'

## Train the model

In [4]:
# Spacing between consecutive points of every series, as a frequency string.
freq = "5min"

# The model is given as many past points as it is asked to forecast.
context_length = prediction_length = 30

# Session object through which all SageMaker calls in this notebook are made.
sagemaker_session = sagemaker.Session()

In [14]:
# The hyperparameters are the same for every model, so they are built once
# instead of on every loop iteration.
hyperparameters = {
    "time_freq": freq,
    "context_length": context_length,
    "prediction_length": prediction_length,
    "num_cells": "32",
    "num_layers": "2",
    "likelihood": "student-t",
    "epochs": "20",
    "mini_batch_size": "32",
    "learning_rate": "0.001",
    "dropout_rate": "0.05",
    "early_stopping_patience": "10"
}

# Maps each model name to the (timestamped) name of the training job started for it,
# so the job can be looked up afterwards instead of copying its name from the logs.
training_job_names = {}

for model_name, model_path in MODEL_PATHS.items():

    # One estimator per model: same image and instance type, but a model-specific
    # job-name prefix and S3 output location.
    estimator = sagemaker.estimator.Estimator(
        sagemaker_session=sagemaker_session,
        role='arn:aws:iam::223261615538:role/terraform-sagemaker-role',
        image_name=DEEPAR_IMAGE,
        train_instance_count=1,
        train_instance_type='ml.m5.4xlarge',
        base_job_name=model_name + '-marketplace-poc',
        output_path=model_path,
    )
    estimator.set_hyperparameters(**hyperparameters)

    # Only a training channel is supplied; its data is read from '<model_path>/train/'.
    data_channels = {
        "train": "{}/train/".format(model_path),
    }

    estimator.fit(inputs=data_channels)
    training_job_names[model_name] = estimator.latest_training_job.name

INFO:sagemaker:Creating training-job with name: memory-forecasting-marketplace-poc-2018-08-23-17-40-23-472


2018-08-23 17:40:23 Starting - Starting the training job............
2018-08-23 17:42:09 Downloading - Downloading input data
2018-08-23 17:42:16 Training - Training in-progress.........
[31mArguments: train[0m
[31m[08/23/2018 17:43:47 INFO 140093389563712] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/default-input.json: {u'num_dynamic_feat': u'auto', u'dropout_rate': u'0.10', u'mini_batch_size': u'128', u'test_quantiles': u'[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'num_eval_samples': u'100', u'learning_rate': u'0.001', u'num_cells': u'40', u'num_layers': u'2', u'embedding_dimension': u'10', u'_kvstore': u'auto', u'_num_kv_servers': u'auto', u'cardinality': u'auto', u'likelihood': u'student-t', u'early_stopping_patience': u''}[0m
[31m[08/23/2018 17:43:47 INFO 140093389563712] Reading provided configuration from /opt/ml/input/config/hyperparameters.json: {u'dropout_rate': u'0.05

[31m[08/23/2018 17:43:57 INFO 140093389563712] Epoch[7] Batch[0] avg_epoch_loss=15.370348[0m
[31m[08/23/2018 17:43:57 INFO 140093389563712] Epoch[7] Batch[5] avg_epoch_loss=15.077854[0m
[31m[08/23/2018 17:43:57 INFO 140093389563712] Epoch[7] Batch [5]#011Speed: 493.97 samples/sec#011loss=15.077854[0m
[31m[08/23/2018 17:43:57 INFO 140093389563712] Epoch[7] Batch[10] avg_epoch_loss=14.963686[0m
[31m[08/23/2018 17:43:57 INFO 140093389563712] Epoch[7] Batch [10]#011Speed: 471.03 samples/sec#011loss=14.826684[0m
[31m[08/23/2018 17:43:57 INFO 140093389563712] processed a total of 347 examples[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 1196.018934249878, "sum": 1196.018934249878, "min": 1196.018934249878}}, "EndTime": 1535046237.999625, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "AWS/DeepAR"}, "StartTime": 1535046236.80338}
[0m
[31m[08/23/2018 17:43:57 INFO 140093389563712] #throughput_metric: host=algo-1, train throughput=290.10

[31m#metrics {"Metrics": {"get_graph.time": {"count": 1, "max": 1223.228931427002, "sum": 1223.228931427002, "min": 1223.228931427002}}, "EndTime": 1535046253.002928, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "AWS/DeepAR"}, "StartTime": 1535046251.779358}
[0m
[31m[08/23/2018 17:44:13 INFO 140093389563712] Number of GPUs being used: 0[0m
[31m#metrics {"Metrics": {"finalize.time": {"count": 1, "max": 2191.711902618408, "sum": 2191.711902618408, "min": 2191.711902618408}}, "EndTime": 1535046253.971383, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "AWS/DeepAR"}, "StartTime": 1535046253.003048}
[0m
[31m[08/23/2018 17:44:13 INFO 140093389563712] Serializing to /opt/ml/model/model_algo-1[0m
[31m[08/23/2018 17:44:14 INFO 140093389563712] Saved checkpoint to "/opt/ml/model/model_algo-1-0000.params"[0m
[31m#metrics {"Metrics": {"model.serialize.time": {"count": 1, "max": 159.75689888000488, "sum": 159.75689888000488, "min": 159.75

INFO:sagemaker:Creating training-job with name: cpu-forecasting-marketplace-poc-2018-08-23-17-44-47-593


2018-08-23 17:44:48 Starting - Starting the training job............
2018-08-23 17:46:51 Downloading - Downloading input data...
2018-08-23 17:46:58 Training - Training in-progress..........
[31mArguments: train[0m
[31m[08/23/2018 17:48:56 INFO 140653950183232] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/default-input.json: {u'num_dynamic_feat': u'auto', u'dropout_rate': u'0.10', u'mini_batch_size': u'128', u'test_quantiles': u'[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'num_eval_samples': u'100', u'learning_rate': u'0.001', u'num_cells': u'40', u'num_layers': u'2', u'embedding_dimension': u'10', u'_kvstore': u'auto', u'_num_kv_servers': u'auto', u'cardinality': u'auto', u'likelihood': u'student-t', u'early_stopping_patience': u''}[0m
[31m[08/23/2018 17:48:56 INFO 140653950183232] Reading provided configuration from /opt/ml/input/config/hyperparameters.json: {u'dropout_rate': u'

[31m[08/23/2018 17:49:03 INFO 140653950183232] processed a total of 320 examples[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 1073.5270977020264, "sum": 1073.5270977020264, "min": 1073.5270977020264}}, "EndTime": 1535046543.817727, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "AWS/DeepAR"}, "StartTime": 1535046542.743968}
[0m
[31m[08/23/2018 17:49:03 INFO 140653950183232] #throughput_metric: host=algo-1, train throughput=298.058143419 records/second[0m
[31m[08/23/2018 17:49:03 INFO 140653950183232] #progress_metric: host=algo-1, completed 30 % of epochs[0m
[31m[08/23/2018 17:49:03 INFO 140653950183232] loss did not improve for 2 epochs[0m
[31m[08/23/2018 17:49:04 INFO 140653950183232] Epoch[6] Batch[0] avg_epoch_loss=14.985869[0m
[31m[08/23/2018 17:49:04 INFO 140653950183232] Epoch[6] Batch[5] avg_epoch_loss=14.760305[0m
[31m[08/23/2018 17:49:04 INFO 140653950183232] Epoch[6] Batch [5]#011Speed: 539.22 samples/sec#011loss=14.7


2018-08-23 17:49:25 Uploading - Uploading generated training model[31m[08/23/2018 17:49:14 INFO 140653950183232] Epoch[15] Batch[0] avg_epoch_loss=14.995030[0m
[31m[08/23/2018 17:49:14 INFO 140653950183232] Epoch[15] Batch[5] avg_epoch_loss=14.919340[0m
[31m[08/23/2018 17:49:14 INFO 140653950183232] Epoch[15] Batch [5]#011Speed: 548.96 samples/sec#011loss=14.919340[0m
[31m[08/23/2018 17:49:14 INFO 140653950183232] processed a total of 317 examples[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 1044.6219444274902, "sum": 1044.6219444274902, "min": 1044.6219444274902}}, "EndTime": 1535046554.543962, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "AWS/DeepAR"}, "StartTime": 1535046553.499295}
[0m
[31m[08/23/2018 17:49:14 INFO 140653950183232] #throughput_metric: host=algo-1, train throughput=303.433511699 records/second[0m
[31m[08/23/2018 17:49:14 INFO 140653950183232] #progress_metric: host=algo-1, completed 80 % of epochs[0m
[31m[


2018-08-23 17:49:31 Completed - Training job completed


INFO:sagemaker:Creating training-job with name: http-latency-forecasting-marketplace-po-2018-08-23-17-50-12-874


Billable seconds: 160
2018-08-23 17:50:13 Starting - Starting the training job............
2018-08-23 17:51:53 Downloading - Downloading input data
2018-08-23 17:52:00 Training - Training in-progress.........
[31mArguments: train[0m
[31m[08/23/2018 17:53:34 INFO 140677364438848] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/default-input.json: {u'num_dynamic_feat': u'auto', u'dropout_rate': u'0.10', u'mini_batch_size': u'128', u'test_quantiles': u'[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'num_eval_samples': u'100', u'learning_rate': u'0.001', u'num_cells': u'40', u'num_layers': u'2', u'embedding_dimension': u'10', u'_kvstore': u'auto', u'_num_kv_servers': u'auto', u'cardinality': u'auto', u'likelihood': u'student-t', u'early_stopping_patience': u''}[0m
[31m[08/23/2018 17:53:34 INFO 140677364438848] Reading provided configuration from /opt/ml/input/config/hyperparameters.json: {u

[31m[08/23/2018 17:53:50 INFO 140677364438848] Epoch[13] Batch[0] avg_epoch_loss=2.844733[0m
[31m[08/23/2018 17:53:50 INFO 140677364438848] Epoch[13] Batch[5] avg_epoch_loss=2.850773[0m
[31m[08/23/2018 17:53:50 INFO 140677364438848] Epoch[13] Batch [5]#011Speed: 494.53 samples/sec#011loss=2.850773[0m
[31m[08/23/2018 17:53:50 INFO 140677364438848] processed a total of 280 examples[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 935.5719089508057, "sum": 935.5719089508057, "min": 935.5719089508057}}, "EndTime": 1535046830.724603, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "AWS/DeepAR"}, "StartTime": 1535046829.788808}
[0m
[31m[08/23/2018 17:53:50 INFO 140677364438848] #throughput_metric: host=algo-1, train throughput=299.252056584 records/second[0m
[31m[08/23/2018 17:53:50 INFO 140677364438848] #progress_metric: host=algo-1, completed 70 % of epochs[0m
[31m[08/23/2018 17:53:50 INFO 140677364438848] best epoch loss so far[0m
[31

[31m#metrics {"Metrics": {"totaltime": {"count": 1, "max": 25606.745958328247, "sum": 25606.745958328247, "min": 25606.745958328247}, "setuptime": {"count": 1, "max": 6.903886795043945, "sum": 6.903886795043945, "min": 6.903886795043945}}, "EndTime": 1535046840.043929, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "AWS/DeepAR"}, "StartTime": 1535046839.780626}
[0m

2018-08-23 17:54:04 Uploading - Uploading generated training model
2018-08-23 17:54:09 Completed - Training job completed


INFO:sagemaker:Creating training-job with name: http-request-count-forecasting-marketpl-2018-08-23-17-54-34-508


Billable seconds: 136
2018-08-23 17:54:34 Starting - Starting the training job............
2018-08-23 17:56:13 Downloading - Downloading input data
2018-08-23 17:56:19 Training - Training in-progress........
[31mArguments: train[0m
[31m[08/23/2018 17:57:53 INFO 140477245183808] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/default-input.json: {u'num_dynamic_feat': u'auto', u'dropout_rate': u'0.10', u'mini_batch_size': u'128', u'test_quantiles': u'[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'num_eval_samples': u'100', u'learning_rate': u'0.001', u'num_cells': u'40', u'num_layers': u'2', u'embedding_dimension': u'10', u'_kvstore': u'auto', u'_num_kv_servers': u'auto', u'cardinality': u'auto', u'likelihood': u'student-t', u'early_stopping_patience': u''}[0m
[31m[08/23/2018 17:57:53 INFO 140477245183808] Reading provided configuration from /opt/ml/input/config/hyperparameters.json: {u'

[31m[08/23/2018 17:58:01 INFO 140477245183808] Epoch[6] Batch[0] avg_epoch_loss=1.526797[0m
[31m[08/23/2018 17:58:02 INFO 140477245183808] Epoch[6] Batch[5] avg_epoch_loss=1.470686[0m
[31m[08/23/2018 17:58:02 INFO 140477245183808] Epoch[6] Batch [5]#011Speed: 459.39 samples/sec#011loss=1.470686[0m
[31m[08/23/2018 17:58:02 INFO 140477245183808] processed a total of 312 examples[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 1127.4418830871582, "sum": 1127.4418830871582, "min": 1127.4418830871582}}, "EndTime": 1535047082.580252, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "AWS/DeepAR"}, "StartTime": 1535047081.452583}
[0m
[31m[08/23/2018 17:58:02 INFO 140477245183808] #throughput_metric: host=algo-1, train throughput=276.707623304 records/second[0m
[31m[08/23/2018 17:58:02 INFO 140477245183808] #progress_metric: host=algo-1, completed 35 % of epochs[0m
[31m[08/23/2018 17:58:02 INFO 140477245183808] best epoch loss so far[0m
[31

[31m[08/23/2018 17:58:11 INFO 140477245183808] Saved checkpoint to "/opt/ml/model/state_f7b490d7-2581-497e-974e-d3d11437aec3-0000.params"[0m
[31m#metrics {"Metrics": {"state.serialize.time": {"count": 1, "max": 20.440101623535156, "sum": 20.440101623535156, "min": 20.440101623535156}}, "EndTime": 1535047091.523418, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "AWS/DeepAR"}, "StartTime": 1535047091.502745}
[0m
[31m[08/23/2018 17:58:12 INFO 140477245183808] Epoch[15] Batch[0] avg_epoch_loss=0.772145[0m
[31m[08/23/2018 17:58:12 INFO 140477245183808] Epoch[15] Batch[5] avg_epoch_loss=0.736896[0m
[31m[08/23/2018 17:58:12 INFO 140477245183808] Epoch[15] Batch [5]#011Speed: 592.28 samples/sec#011loss=0.736896[0m
[31m[08/23/2018 17:58:12 INFO 140477245183808] Epoch[15] Batch[10] avg_epoch_loss=0.826200[0m
[31m[08/23/2018 17:58:12 INFO 140477245183808] Epoch[15] Batch [10]#011Speed: 568.62 samples/sec#011loss=0.933365[0m
[31m[08/23/2018 17:58:12 INFO 140


2018-08-23 17:58:23 Uploading - Uploading generated training model
2018-08-23 17:58:29 Completed - Training job completed
Billable seconds: 136


## Create an endpoint and test model

In [16]:


# Settings for the endpoint that serves the model produced by one finished training job.
# NOTE(review): the job name is a hard-coded, timestamped name from one particular run and
# has to be updated by hand after retraining. It names the http-latency job, while the log
# recorded under the final cell shows an http-request-count endpoint being deleted --
# confirm which model this cell is meant to deploy.
endpoint_settings = dict(
    job_name='http-latency-forecasting-marketplace-po-2018-08-23-17-50-12-874',
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    deployment_image=DEEPAR_IMAGE,
    role='arn:aws:iam::223261615538:role/terraform-sagemaker-role',
)

# The returned endpoint name is what the predictor and the clean-up cell refer to.
endpoint_name = sagemaker_session.endpoint_from_job(**endpoint_settings)



INFO:sagemaker:Creating model with name: http-latency-forecasting-marketplace-po-2018-08-23-17-50-12-874
INFO:sagemaker:Creating endpoint-config with name http-latency-forecasting-marketplace-po-2018-08-23-17-50-12-874
INFO:sagemaker:Creating endpoint with name http-latency-forecasting-marketplace-po-2018-08-23-17-50-12-874


--------------------------------------------------------------------------!

In [17]:
import json 

class DeepARPredictor(sagemaker.predictor.RealTimePredictor):
    """Predictor that sends one time series to a DeepAR endpoint and returns its quantile forecast.

    `set_prediction_parameters` **must** be called before `predict`, because the time index of
    the forecast is built from `self.freq` and `self.prediction_length`.
    """

    def set_prediction_parameters(self, freq, prediction_length):
        """Set the time frequency and prediction length parameters. This method **must** be called
        before being able to use `predict`.

        Parameters:
        freq -- string indicating the time frequency
        prediction_length -- integer, number of predicted time points

        Return value: none.
        """
        self.freq = freq
        self.prediction_length = prediction_length

    def predict(self, ts, encoding="utf-8", num_samples=100, quantiles=("0.1", "0.5", "0.9")):
        """Request a quantile forecast for the single time series `ts`.

        Parameters:
        ts -- the observed target values of one series (anything with a length that `json` can
              serialize, e.g. a list of numbers); `series_to_obj` wraps it into a request instance
        encoding -- string, encoding to use for the request (default: "utf-8")
        num_samples -- integer, number of samples to compute at prediction time (default: 100)
        quantiles -- sequence of strings specifying the quantiles to compute
                     (default: ("0.1", "0.5", "0.9"))

        Return value: list of `pandas.DataFrame` objects, one per request instance (a single one
        here). Each frame holds the returned quantiles and is indexed by the forecast timestamps,
        which begin one step of `freq` after the last observed value.
        """
        instances = [series_to_obj(ts)]
        prediction_times = [self.__forecast_start(instance) for instance in instances]
        req = self.__encode_request(instances, encoding, num_samples, quantiles)
        res = super(DeepARPredictor, self).predict(req)
        return self.__decode_response(res, prediction_times, encoding)

    def __forecast_start(self, instance):
        """Return the timestamp of the first forecast point for one request instance."""
        # len(target) + 1 evenly spaced stamps beginning at `start`: the last stamp is the
        # first step after the observed data, which is where the forecast begins.
        observed = len(instance["target"])
        return pd.date_range(start=instance["start"], periods=observed + 1, freq=self.freq)[-1]

    def __encode_request(self, instances, encoding, num_samples, quantiles):
        """Serialize the request instances and the output configuration to encoded JSON."""
        configuration = {
            "num_samples": num_samples,
            "output_types": ["quantiles"],
            # A list keeps the JSON payload independent of the sequence type passed in.
            "quantiles": list(quantiles),
        }
        http_request_data = {"instances": instances, "configuration": configuration}
        return json.dumps(http_request_data).encode(encoding)

    def __decode_response(self, response, prediction_times, encoding):
        """Turn the endpoint's JSON response into one time-indexed DataFrame per prediction."""
        response_data = json.loads(response.decode(encoding))
        list_of_df = []
        for k, prediction_start in enumerate(prediction_times):
            # pd.date_range replaces DatetimeIndex(start=..., freq=..., periods=...), a
            # constructor form that recent pandas releases no longer accept.
            prediction_index = pd.date_range(
                start=prediction_start, freq=self.freq, periods=self.prediction_length
            )
            list_of_df.append(
                pd.DataFrame(data=response_data['predictions'][k]['quantiles'], index=prediction_index)
            )
        return list_of_df

    
def series_to_obj(ts, start="2018-08-22 13:36:10"):
    """Wrap the target values of one time series in a request-instance dictionary.

    Parameters:
    ts -- the target values of the series; stored as-is (not copied) under "target"
    start -- timestamp string of the first value (default: "2018-08-22 13:36:10",
             the value that was previously hard-coded)

    Return value: dict with the keys "start" and "target".
    """
    return {"start": start, "target": ts}

In [18]:
import pandas as pd
# The file holds one JSON record per line; keep the 'target' column as a Series that is
# indexed by the parsed 'start' timestamps.
df = pd.read_json('./data/http_request_query_10.json', lines=True).set_index('start')['target']
df.index = pd.to_datetime(df.index)

# Test series are the complete targets; training series are the same targets with the
# final `prediction_length` points cut off.
timeseries_test = list(df)
timeseries_training = [series[:-prediction_length] for series in df]

# Shallow copies of the two lists under the names used by the cells that follow.
new_time_series_training = list(timeseries_training)
new_time_series_test = list(timeseries_test)

In [19]:
# Client for the deployed endpoint; requests are sent as JSON.
predicto  = DeepARPredictor(
    endpoint=endpoint_name,
    sagemaker_session=sagemaker_session,
    content_type="application/json"
)

predicto.set_prediction_parameters(freq, prediction_length)

# Forecast from the third series with its last `prediction_length` points held out.
# predict() returns a list, so [0] picks the forecast DataFrame itself.
list_of_df  = predicto.predict(new_time_series_training[2])[0] # predicted forecast

# Complete third series, for comparison with the forecast. `.iloc` selects by position
# explicitly: `df` is indexed by timestamps, and looking up an integer with plain `[]` on
# such a Series only works through a positional fallback that pandas has deprecated.
# NOTE(review): this index uses a 5-second step, whereas the predictor is configured with
# `freq` -- confirm which spacing matches the data.
actual_data = pd.Series(
    df.iloc[2],
    index=pd.date_range('2018-08-22 13:36:10', periods=len(df.iloc[2]), freq='5s'),
) # full data set

In [30]:
# Clean-up: delete the endpoint referenced by `endpoint_name` once testing is done.
sagemaker_session.delete_endpoint(endpoint_name)

INFO:sagemaker:Deleting endpoint with name: http-request-count-forecasting-marketpl-2018-08-23-17-54-34-508
