In [100]:
import boto3
from time import sleep
import subprocess
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import numpy as np
import warnings; warnings.simplefilter('ignore')

%matplotlib inline
session = boto3.Session(region_name='us-east-1') #us-east-1 is also supported

forecast = session.client(service_name='forecast')
forecastquery = session.client(service_name='forecastquery')

RELATED_TIME_SERIES Dataset Type

You can provide Amazon Forecast with related time-series datasets, such as the price or the number of web hits the item received on a particular date. The more information that you provide, the more accurate the forecast. The following fields are required:

    item_id (string)

    timestamp (timestamp)

Although the following fields are optional, Amazon Forecast suggests that you include them:

    price (float) – The price of the item at the time of the timestamp.

    webpage_hits (float) – The number of web page hits received by the item at the timestamp. Applies only to ecommerce websites.

    stockout_days (float) – The number of days left before the item goes out of stock. This is an optional field. Provide it only if the data is available.

    inventory_onhand (float) – The number of items in inventory.

    revenue (float) – The total revenue generated by that item’s sales.

    in_stock (integer; 1=true, 0=false) – A flag that specifies whether the item is in stock.

    promotion_applied (integer; 1=true, 0=false) – A flag that specifies whether there was a marketing promotion for that item at the timestamp.



In [56]:
df = pd.read_csv('train.csv',low_memory=False)

In [57]:
df = df[df['store_nbr'] == 25]

In [58]:
df.columns

Index(['id', 'date', 'store_nbr', 'item_nbr', 'unit_sales', 'onpromotion'], dtype='object')

In [59]:
df['onpromotion'].dtype

dtype('O')

In [60]:
df['onpromotion'].value_counts()

False    1531019
True      113542
Name: onpromotion, dtype: int64

In [61]:
df['onpromotion'].isnull().sum()

412459

In [62]:
df['onpromotion'] = df['onpromotion'].fillna(0)

In [68]:
df['onpromotion'] = df['onpromotion'].apply(lambda x: 1 if x==True else 0)

In [69]:
df['onpromotion'].value_counts()

0    1943478
1     113542
Name: onpromotion, dtype: int64

## Let's check if promotion seems to affect average sales

In [70]:
df.columns

Index(['id', 'date', 'store_nbr', 'item_nbr', 'unit_sales', 'onpromotion'], dtype='object')

In [73]:
df.groupby(['item_nbr', 'onpromotion'])['unit_sales'].mean().head(30)

item_nbr  onpromotion
96995     0              1.409091
99197     0              2.425101
103520    0              1.825893
          1              5.000000
103665    0              3.444840
          1              2.666667
105574    0              5.280866
          1              5.772277
105575    0              6.977792
          1              9.090909
105577    0              1.838855
          1              1.285714
105693    0              2.183381
          1              1.272727
105737    0              7.221534
105857    0              3.867117
          1              1.800000
106716    0              3.078947
108079    0              1.540984
108634    0              2.884211
          1              5.636364
108696    0              1.411765
          1              2.076923
108698    0              1.780952
          1              1.900000
108701    0              1.638436
          1              1.583333
108786    0              2.902913
          1              2

In [74]:
df_related_time_series = df[['item_nbr','date','onpromotion']]

In [75]:
df_related_time_series['date'] = pd.to_datetime(df_related_time_series['date'])

For the related time series, we use the same time filter as we used to train the previous one.

In [76]:
df_related_time_series = df_related_time_series[df['date']<'2017-08-01']

In [77]:
df_related_time_series = df_related_time_series.sort_values(['item_nbr','date'])

Same as before, we are going to assume that the days that it was uninformed the item was not on promotion

In [78]:
dfCompletedList = []

In [79]:
def completeItem(dfItem):    
    min_date = dfItem['date'].min()
    max_date = dfItem['date'].max()
    if min_date == max_date:
        #only one data point
        return
    r = pd.date_range(start=min_date, end=max_date)
    dfItemNew = dfItem.set_index('date').reindex(r).fillna(0.0).rename_axis('date').reset_index()
    dfItemNew['item_nbr'] = dfItem['item_nbr'].max()
    dfCompletedList.append(dfItemNew)

In [80]:
for index,group in df_related_time_series.groupby('item_nbr'):
    completeItem(group)

In [81]:
df_related_time_series.to_csv('related_time_series_train.csv',index=False,header=False)

In [82]:
df_related_time_series.head()

Unnamed: 0,item_nbr,date,onpromotion
1362063,96995,2013-02-04,0
3670262,96995,2013-03-31,0
3843971,96995,2013-04-04,0
3930364,96995,2013-04-06,0
4144340,96995,2013-04-11,0


In [83]:
df_related_time_series.shape

(2030901, 3)

## Upload to S3

In [84]:
s3 = session.client('s3')

accountId = boto3.client('sts').get_caller_identity().get('Account')

bucketName = 'amazon-forecast-chrisking-data-mg'# Update to your bucket name
key="favorita/related_time_series_train.csv"

s3.upload_file(Filename="related_time_series_train.csv", Bucket=bucketName, Key=key)

roleArn = 'arn:aws:iam::%s:role/amazonforecast'%accountId

## Create Data Set

In [85]:
DATASET_FREQUENCY = "D" 
TIMESTAMP_FORMAT = "yyyy-MM-dd"

In [88]:
project = 'favorita_forecast4' # Replace this with a unique name here, make sure the entire name is < 30 characters.
datasetName= project+'_dswre4'
datasetGroupName= project +'_gpwrel4'
s3DataPath = "s3://"+bucketName+"/"+key

In [89]:
# Specify the schema of your dataset here. Make sure the order of columns matches the raw data files.
schema ={
   "Attributes":[
      {
         "AttributeName":"item_id",
         "AttributeType":"string"
      },
      {
         "AttributeName":"timestamp",
         "AttributeType":"timestamp"
      },
      {
         "AttributeName":"promotion_applied",
         "AttributeType":"integer"
      }
   ]
}

response=forecast.create_dataset(
                    Domain="RETAIL",
                    DatasetType='RELATED_TIME_SERIES',
                    DataFormat='CSV',
                    DatasetName=datasetName,
                    DataFrequency=DATASET_FREQUENCY, 
                    TimeStampFormat=TIMESTAMP_FORMAT,
                    Schema = schema
                   )

In [90]:
forecast.list_datasets()

{'DatasetNames': ['acindar_dem',
  'acindar_demand_ds',
  'acindar_ds2',
  'acindar_dsv2',
  'acindar_forecast2_ds2',
  'cca_forecast_ccads',
  'favorita_forecast2_ds2',
  'favorita_forecast3_ds3',
  'favorita_forecast3_dswrel',
  'favorita_forecast4_dswre4'],
 'ResponseMetadata': {'RequestId': '270fccc4-e672-4165-924c-1be7fa5522e0',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Thu, 25 Apr 2019 19:02:24 GMT',
   'x-amzn-requestid': '270fccc4-e672-4165-924c-1be7fa5522e0',
   'content-length': '232',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}

In [91]:
forecast.create_dataset_group(DatasetGroupName=datasetGroupName,RoleArn=roleArn,DatasetNames=['favorita_forecast3_ds3',datasetName])

{'DatasetGroupName': 'favorita_forecast4_gpwrel4',
 'DatasetGroupArn': 'arn:aws:forecast:us-east-1:452432741922:dsgroup/favorita_forecast4_gpwrel4',
 'ResponseMetadata': {'RequestId': 'b1904e4c-fa46-42b7-86ea-d90305a253d9',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Thu, 25 Apr 2019 19:03:01 GMT',
   'x-amzn-requestid': 'b1904e4c-fa46-42b7-86ea-d90305a253d9',
   'content-length': '144',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}

In [92]:
ds_import_job_response=forecast.create_dataset_import_job(DatasetName=datasetName,Delimiter=',', DatasetGroupName =datasetGroupName ,S3Uri= s3DataPath)

In [93]:
ds_versionId=ds_import_job_response['VersionId']

In [94]:
while True:
    dataImportStatus = forecast.describe_dataset_import_job(DatasetName=datasetName,VersionId=ds_versionId)['Status']
    print(dataImportStatus)
    if dataImportStatus != 'ACTIVE' and dataImportStatus != 'FAILED':
        sleep(30)
    else:
        break

QUEUED
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
ACTIVE


In [95]:
predictorName= project+'_mqrnn_w_rel'

In [96]:
forecastHorizon = 30

In [97]:
createPredictorResponse=forecast.create_predictor(RecipeName='forecast_MQRNN',
  DatasetGroupName= datasetGroupName ,PredictorName=predictorName, 
  ForecastHorizon = forecastHorizon)

In [98]:
predictorVerionId=createPredictorResponse['VersionId']

In [None]:
while True:
    predictorStatus = forecast.describe_predictor(PredictorName=predictorName,VersionId=predictorVerionId)['Status']
    print(predictorStatus)
    if predictorStatus != 'ACTIVE' and predictorStatus != 'FAILED':
        sleep(30)
    else:
        break

CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING


In [101]:
forecastquery.get_accuracy_metrics(PredictorName=predictorName)

{'ErrorMetrics': {'Metrics': {'p10': '0.18657586720285022',
   'p50': '0.6387216073516532',
   'p90': '0.4404065424829798',
   'rmse': '5.454553422981255'},
  'MetricsByBucket': []},
 'ResponseMetadata': {'RequestId': '9340edbf-2aa7-4f72-973b-941c1419e1fc',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Fri, 26 Apr 2019 14:07:20 GMT',
   'x-amzn-requestid': '9340edbf-2aa7-4f72-973b-941c1419e1fc',
   'content-length': '160',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}

## We see that the metrics improve

In Notebook 3, we have created a predictor with the following accuracy metrics: 

```python
'ModelMetrics': {'MQRNN': {'Metrics': {'p10': '0.1899730645663291',
    'p50': '0.6563960518373003',
    'p90': '0.46699407174441915',
    'rmse': '5.51639276471261'},
   'MetricsByBucket': []}},
 'ResponseMetadata': {'RequestId': '6b725d19-d4fe-4aec-9493-552f050557e9',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Mon, 11 Mar 2019 19:58:38 GMT',
   'x-amzn-requestid': '6b725d19-d4fe-4aec-9493-552f050557e9',
   'content-length': '169',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}
```

In [102]:
forecast.deploy_predictor(PredictorName=predictorName)

{'PredictorName': 'favorita_forecast4_mqrnn_w_rel',
 'VersionId': '815b8abe',
 'PredictorArn': 'arn:aws:forecast:us-east-1:452432741922:predictor/favorita_forecast4_mqrnn_w_rel',
 'ResponseMetadata': {'RequestId': 'dac3c45f-d68c-4fff-b21d-e375f84a8444',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Fri, 26 Apr 2019 14:12:16 GMT',
   'x-amzn-requestid': 'dac3c45f-d68c-4fff-b21d-e375f84a8444',
   'content-length': '171',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}

In [None]:
while True:
    deployedPredictorStatus = forecast.describe_deployed_predictor(PredictorName=predictorName)['Status']
    print(deployedPredictorStatus)
    if deployedPredictorStatus != 'ACTIVE' and deployedPredictorStatus != 'FAILED':
        sleep(30)
    else:
        break
print(deployedPredictorStatus)

CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
