**Important – Do not use in production, for demonstration purposes only – please review the legal notices before continuing**

# Retail In-Store Customer Wait Time Forecasting with AutoML

In this notebook, we use Azure AutoML to forecast the average wait time of customers using data from multiple Wide World Importers stores.

 ![Image](https://stretailprod.blob.core.windows.net/notebookimages/wait_time.jpg?sp=r&st=2022-02-25T19:10:16Z&se=2024-02-26T03:10:16Z&sv=2020-08-04&sr=b&sig=VU0rM0ICxMcekloQuG3OPxcvUWOcifIQglHhttbQDiQ%3D)

### Setting up the workspace

In [None]:
import azureml.core
import pandas as pd

# Print the installed Azure ML SDK version so it is recorded with the notebook output.
print("SDK Version:", azureml.core.VERSION)

from azureml.core import Workspace, Datastore, Dataset
# Build the workspace handle from the saved workspace configuration;
# `ws` is reused by the compute, experiment and datastore cells further down.
ws = Workspace.from_config()
ws

#### Create new datastore for Datasets

In [2]:
import GlobalVariables as gv
from azureml.data.datapath import DataPath
from azure.storage.blob import ContainerClient, BlobClient
from io import BytesIO

### Reading Data 

In [3]:
# Download the raw customer wait-time observations from Azure Blob Storage.
blob = BlobClient.from_connection_string(
    conn_str=gv.STORAGE_ACCOUNT_CONNECTION_STRING,
    container_name=gv.CONTAINER_NAME,
    blob_name='customer_wait_time.csv',
)
blob_data = blob.download_blob()

# Read the payload exactly once. The previous version also built a BytesIO
# that was immediately discarded, and used the deprecated content_as_bytes();
# readall() is the supported way to pull the full blob into memory.
data = pd.read_csv(BytesIO(blob_data.readall()))

# Work on a copy so the frame as loaded (`data`) stays untouched.
customer_df = data.copy()

### EDA

In [4]:
customer_df.head()

Unnamed: 0,city,date,wait_time
0,London,2018-06-28 17:46:00,6.2
1,San Diego,2021-06-18 20:47:00,7.6
2,London,2019-08-10 23:47:00,7.0
3,London,2021-11-04 02:59:00,8.4
4,New York City,2018-06-11 17:28:00,10.0


In [5]:
# View info to see what the column names and types are
customer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4369400 entries, 0 to 4369399
Data columns (total 3 columns):
city         object
date         object
wait_time    float64
dtypes: float64(1), object(2)
memory usage: 100.0+ MB


In [6]:
# Make sure wait_time is numeric before it is averaged per day.
# pandas is already imported in the first cell, so it is not re-imported here.
# With the default errors='raise', a value that cannot be parsed stops the
# notebook here instead of silently becoming NaN.
customer_df['wait_time'] = pd.to_numeric(customer_df['wait_time'])

In [7]:
customer_df.dtypes

city          object
date          object
wait_time    float64
dtype: object

#### Data Preparation for AutoML

In [8]:
# Work on a copy so the date handling in the following cells leaves customer_df unchanged.
timeseries_df = customer_df.copy()
timeseries_df

Unnamed: 0,city,date,wait_time
0,London,2018-06-28 17:46:00,6.2
1,San Diego,2021-06-18 20:47:00,7.6
2,London,2019-08-10 23:47:00,7.0
3,London,2021-11-04 02:59:00,8.4
4,New York City,2018-06-11 17:28:00,10.0
...,...,...,...
4369395,New York City,2020-11-11 20:06:00,8.2
4369396,New York City,2020-11-14 15:48:00,8.8
4369397,New York City,2020-11-07 05:06:00,8.6
4369398,New York City,2020-11-02 07:03:00,8.8


In [9]:
# Remove the time-of-day component from the date column.
# .dt.normalize() resets every timestamp to midnight while keeping the
# datetime64 dtype. The earlier `.dt.date` round-trip produced the same
# values but created one Python date object per row and re-parsed them all.
timeseries_df['date'] = pd.to_datetime(timeseries_df['date']).dt.normalize()

In [10]:
timeseries_df.head()

Unnamed: 0,city,date,wait_time
0,London,2018-06-28,6.2
1,San Diego,2021-06-18,7.6
2,London,2019-08-10,7.0
3,London,2021-11-04,8.4
4,New York City,2018-06-11,10.0


In [11]:
timeseries_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4369400 entries, 0 to 4369399
Data columns (total 3 columns):
city         object
date         datetime64[ns]
wait_time    float64
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 100.0+ MB


In [12]:
# One row per (city, day): the mean of all wait times observed that day,
# ordered by city and then chronologically with a fresh 0..n-1 index.
timeseries_df_grouped = (
    timeseries_df
    .groupby(['city', 'date'], as_index=False)['wait_time']
    .mean()
    .sort_values(by=['city', 'date'])
    .reset_index(drop=True)
)
timeseries_df_grouped

Unnamed: 0,city,date,wait_time
0,London,2017-12-16,7.800000
1,London,2017-12-17,8.500000
2,London,2017-12-18,8.200000
3,London,2017-12-19,8.075000
4,London,2017-12-20,7.760000
...,...,...,...
9033,Tokyo,2022-11-24,7.526702
9034,Tokyo,2022-11-25,7.532609
9035,Tokyo,2022-11-26,7.496133
9036,Tokyo,2022-11-27,7.752542


In [13]:
timeseries_df_grouped.dtypes

city                 object
date         datetime64[ns]
wait_time           float64
dtype: object

In [14]:
# Safeguard only: the dtypes output above already reports wait_time as float64,
# so this conversion is not expected to change any values.
timeseries_df_grouped['wait_time'] = pd.to_numeric(timeseries_df_grouped['wait_time'])

In [15]:
timeseries_df_grouped

Unnamed: 0,city,date,wait_time
0,London,2017-12-16,7.800000
1,London,2017-12-17,8.500000
2,London,2017-12-18,8.200000
3,London,2017-12-19,8.075000
4,London,2017-12-20,7.760000
...,...,...,...
9033,Tokyo,2022-11-24,7.526702
9034,Tokyo,2022-11-25,7.532609
9035,Tokyo,2022-11-26,7.496133
9036,Tokyo,2022-11-27,7.752542


In [16]:
timeseries_df_grouped.dtypes

city                 object
date         datetime64[ns]
wait_time           float64
dtype: object

#### Split Data based on Cities

In [17]:
# Build one (date, wait_time) frame per city, keyed by the city name exactly
# as it appears in the data (spaces included).
cities = timeseries_df_grouped['city'].unique().tolist()

city_wise_dfs = {}
for city in cities:
    belongs_to_city = timeseries_df_grouped['city'] == city
    city_wise_dfs[city] = timeseries_df_grouped[belongs_to_city][['date', 'wait_time']]

city_wise_dfs['San Diego']

Unnamed: 0,date,wait_time
3616,2017-12-16,9.066667
3617,2017-12-17,8.266667
3618,2017-12-18,8.550000
3619,2017-12-19,8.413333
3620,2017-12-20,8.523077
...,...,...
5419,2022-11-24,8.057511
5420,2022-11-25,7.964187
5421,2022-11-26,7.925878
5422,2022-11-27,7.980814


#### Prepare Training and Testing set

In [18]:
city_wise_dfs

{'London':            date  wait_time
 0    2017-12-16   7.800000
 1    2017-12-17   8.500000
 2    2017-12-18   8.200000
 3    2017-12-19   8.075000
 4    2017-12-20   7.760000
 ...         ...        ...
 1803 2022-11-24   7.932405
 1804 2022-11-25   7.988251
 1805 2022-11-26   8.026506
 1806 2022-11-27   7.970914
 1807 2022-11-28   7.912903
 
 [1808 rows x 2 columns],
 'New York City':            date  wait_time
 1808 2017-12-16   8.600000
 1809 2017-12-17   8.422222
 1810 2017-12-18   8.088889
 1811 2017-12-19   8.210526
 1812 2017-12-20   8.100000
 ...         ...        ...
 3611 2022-11-24   7.653049
 3612 2022-11-25   7.705810
 3613 2022-11-26   7.800610
 3614 2022-11-27   7.574603
 3615 2022-11-28   7.734146
 
 [1808 rows x 2 columns],
 'San Diego':            date  wait_time
 3616 2017-12-16   9.066667
 3617 2017-12-17   8.266667
 3618 2017-12-18   8.550000
 3619 2017-12-19   8.413333
 3620 2017-12-20   8.523077
 ...         ...        ...
 5419 2022-11-24   8.057511
 5420 20

#### Split data based on time

In [19]:
# Training set: every daily observation strictly before the cutoff date.
# (The earlier comment said 2020-10-01; the cutoff actually used is 2022-10-01.)
date_cutoff = pd.to_datetime('2022-10-01')

all_train_dfs = {}
for city, df in city_wise_dfs.items():
    train_df = df[df['date'] < date_cutoff]
    all_train_dfs[city] = train_df

# Preview the training frame of the last city visited by the loop above.
all_train_dfs[city]

Unnamed: 0,date,wait_time
7232,2017-12-17,7.400000
7233,2017-12-18,9.300000
7234,2017-12-19,8.333333
7235,2017-12-20,7.800000
7236,2017-12-21,8.600000
...,...,...
8975,2022-09-26,7.883262
8976,2022-09-27,7.993846
8977,2022-09-28,7.678992
8978,2022-09-29,9.800000


In [20]:
# Test set: every daily observation on or after date_cutoff (2022-10-01).
# The bound is inclusive, so the cutoff day itself belongs to the test set.
all_test_dfs = {}
for city, df in city_wise_dfs.items():
    test_df = df[df['date'] >= date_cutoff]
    all_test_dfs[city] = test_df
    
all_test_dfs['San Diego']

Unnamed: 0,date,wait_time
5365,2022-10-01,7.978072
5366,2022-10-02,7.997587
5367,2022-10-03,8.115976
5368,2022-10-04,7.981427
5369,2022-10-05,7.954632
5370,2022-10-06,8.018786
5371,2022-10-07,8.093366
5372,2022-10-08,8.014352
5373,2022-10-09,8.034949
5374,2022-10-10,7.987793


#### Save training and testing sets to local CSV files (for upload to the Storage Account)

In [21]:
import os

# Every per-city CSV is written beneath this folder. The trailing slash is
# kept so the recorded paths look the same as before.
local_data_folder = 'wait_time_data/'
# exist_ok=True avoids the check-then-create race of `if not exists: mkdir`
# and makes the cell safe to re-run.
os.makedirs(local_data_folder, exist_ok=True)

base_train_file = 'wait_time_data_train_'
base_test_file = 'wait_time_data_test_'

# Paths of all files written, in train/test order per city.
local_files = []
for city, train_df in all_train_dfs.items():
    # File names use hyphens instead of spaces, e.g. "New-York-City".
    city_without_spaces = city.replace(' ', '-')

    # Write the train file first, then the matching test file for the same city.
    for base_name, split_df in ((base_train_file, train_df), (base_test_file, all_test_dfs[city])):
        file_path = os.path.join(local_data_folder, base_name + city_without_spaces + '.csv')
        split_df.to_csv(file_path, index=False)
        local_files.append(file_path)


### Set up AutoML Experiment

#### Set the Data Types for each column. 
This needs to be done explicitly since some ID columns are automatically inferred as integers, when they should be treated as strings

In [22]:
from azureml.data import DataType

# Explicit column types for the per-city CSV files (columns: date, wait_time).
data_types = {
    # wait_time holds fractional daily means (e.g. 7.526702), so it must be
    # read as a float; declaring it as long would force it to whole numbers.
    'wait_time': DataType.to_float(),
    # Dates are written by pandas as YYYY-MM-DD.
    'date': DataType.to_datetime("%Y-%m-%d"),
}

print(len(data_types))

2


#### Load Training data from Storage Blob as a TabularDataSet

In [24]:
all_train_dfs.keys()

dict_keys(['London', 'Sydney', 'Tokyo', 'San-Diego', 'New-York-City'])

In [23]:
# Re-key the multi-word cities with hyphens so the keys match the hyphenated
# names used for the files, experiments and run IDs.
# pop() moves each entry in one step, and the membership check makes the cell
# safe to re-run (the previous version raised KeyError on a second run).
for spaced_name in ('San Diego', 'New York City'):
    if spaced_name in all_train_dfs:
        all_train_dfs[spaced_name.replace(' ', '-')] = all_train_dfs.pop(spaced_name)

In [26]:
# Name of the target column; passed to AutoMLConfig as label_column_name.
y_variable = "wait_time"

In [27]:
# NOTE(review): `all_train_datasets` is not assigned in any cell visible in this
# notebook (execution count 25 is missing), so this line raises NameError on a
# fresh kernel. The recorded output is an empty dict_items([]), in which case
# the AutoML submission loop below would not submit anything.
# TODO: restore the cell that builds one TabularDataset per city.
all_train_datasets.items()

dict_items([])

#### Set up the Compute Target

In [28]:
from azureml.core.compute import AmlCompute

# Handle to the compute target named "mlw-retailprodcompute" in workspace `ws`;
# it is passed to AutoMLConfig as compute_target.
# NOTE(review): presumably this cluster is already provisioned — confirm the name for your workspace.
compute = AmlCompute(ws, "mlw-retailprodcompute")

In [29]:
y_variable

'wait_time'

#### Configure the AutoML model and run it

In [30]:
# Submit one AutoML forecasting experiment per city.
from azureml.core.experiment import Experiment
from azureml.train.automl import AutoMLConfig

# Keep every submitted run, keyed by city. Previously each run handle was
# overwritten by the next iteration and lost; with this dict the run ID can be
# read back programmatically, e.g. all_training_runs[city].id.
all_training_runs = {}

for city, traindataset in all_train_datasets.items():
    city_without_spaces = '-'.join(city.split(' '))
    experiment_name = 'Waittime-Forecasting-Experiment_' + city_without_spaces
    experiment = Experiment(ws, experiment_name)

    # NOTE(review): iteration_timeout_minutes equals experiment_timeout_minutes,
    # so a single slow iteration can consume the whole experiment budget.
    automl_config = AutoMLConfig(task = 'forecasting',
                         debug_log = 'automl_errors.log',
                         iteration_timeout_minutes = 15,
                         n_cross_validations=3,
                         experiment_timeout_minutes = 15,
                         label_column_name=y_variable,
                         time_column_name='date',
                         enable_early_stopping=True,
                         compute_target = compute,
                         training_data = traindataset,
                         model_explainability=True)

    # show_output=False: submit without streaming the run log into the notebook.
    training_run = experiment.submit(automl_config, show_output = False)
    all_training_runs[city] = training_run

#### Retrieve model to predict the test set

In [None]:
# Reconnect to the workspace and fetch the wait-time datastore.
import azureml.core
from azureml.core import Workspace, Datastore, Dataset, Experiment

ws = Workspace.from_config()
blob_datastore_name = gv.WAIT_TIME_DATASTORE_NAME
dstore = Datastore.get(ws, datastore_name=blob_datastore_name)

# One line per fact. The storage account shown is the one behind `dstore`.
summary_lines = [
    'Workspace Name: ' + ws.name,
    'Resource Group: ' + ws.resource_group,
    'Default Storage Account Name: ' + dstore.account_name,
    'AzureML Core Version: ' + azureml.core.VERSION,
]
print('\n'.join(summary_lines))

In [32]:
# AutoML parent run ID per city, keyed by the hyphenated city name.
# These are hard-coded IDs of previously submitted runs; they must be replaced
# when the experiments are re-run or when a different workspace is used.
autoMLRunIds = {
    'New-York-City': 'AutoML_7fad69d0-651f-49d6-8159-030baae67e82',
    'London': 'AutoML_50805a21-6ac7-41cf-818d-7b4c71f60de1',
    'Sydney': 'AutoML_571c2446-779a-4b35-8890-cdabf305286b',
    'San-Diego': 'AutoML_fc2aee46-d7a8-4308-bcb2-c9ca409b4f88',
    'Tokyo': 'AutoML_a01eb23b-8ea3-4bb0-b375-ce2adaee8e09',    
}

In [33]:
# Re-attach to the AutoML runs listed in autoMLRunIds (nothing is submitted here).
from azureml.train.automl.run import AutoMLRun

all_automl_runs = {}
for city, autoMLRunId in autoMLRunIds.items():
    # Experiment names follow the pattern used at submission time:
    # the fixed prefix plus the city name with spaces turned into hyphens.
    experiment_name = 'Waittime-Forecasting-Experiment_' + city.replace(' ', '-')
    experiment = Experiment(workspace=ws, name=experiment_name)

    automl_run = AutoMLRun(experiment, autoMLRunId, outputs=None)
    display(automl_run)
    all_automl_runs[city] = automl_run

Experiment,Id,Type,Status,Details Page,Docs Page
Waittime-Forecasting-Experiment_New-York-City,AutoML_7fad69d0-651f-49d6-8159-030baae67e82,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation


Experiment,Id,Type,Status,Details Page,Docs Page
Waittime-Forecasting-Experiment_London,AutoML_50805a21-6ac7-41cf-818d-7b4c71f60de1,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation


Experiment,Id,Type,Status,Details Page,Docs Page
Waittime-Forecasting-Experiment_Sydney,AutoML_571c2446-779a-4b35-8890-cdabf305286b,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation


Experiment,Id,Type,Status,Details Page,Docs Page
Waittime-Forecasting-Experiment_San-Diego,AutoML_fc2aee46-d7a8-4308-bcb2-c9ca409b4f88,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation


Experiment,Id,Type,Status,Details Page,Docs Page
Waittime-Forecasting-Experiment_Tokyo,AutoML_a01eb23b-8ea3-4bb0-b375-ce2adaee8e09,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [34]:
# Fetch the best child run and its fitted model for every city.
all_models = {}

for city, automl_run in all_automl_runs.items():
    # get_output() yields (best run, fitted model) for the AutoML parent run.
    best_run, fitted_model = automl_run.get_output()
    print(best_run.properties['model_name'])
    all_models[city] = fitted_model

Package:azureml-automl-runtime, training version:1.38.0, current version:1.37.0
Package:azureml-core, training version:1.38.0, current version:1.37.0
Package:azureml-dataprep, training version:2.26.0, current version:2.25.0
Package:azureml-dataprep-rslex, training version:2.2.0, current version:2.1.0
Package:azureml-dataset-runtime, training version:1.38.0, current version:1.37.0
Package:azureml-defaults, training version:1.38.0, current version:1.37.0
Package:azureml-interpret, training version:1.38.0, current version:1.37.0
Package:azureml-mlflow, training version:1.38.0, current version:1.37.0
Package:azureml-pipeline-core, training version:1.38.0, current version:1.37.0
Package:azureml-responsibleai, training version:1.38.0, current version:1.37.0
Package:azureml-telemetry, training version:1.38.0, current version:1.37.0
Package:azureml-train-automl-client, training version:1.38.0, current version:1.37.0
Package:azureml-train-automl-runtime, training version:1.38.0, current version

AutoML7fad69d060


Package:azureml-automl-runtime, training version:1.38.0, current version:1.37.0
Package:azureml-core, training version:1.38.0, current version:1.37.0
Package:azureml-dataprep, training version:2.26.0, current version:2.25.0
Package:azureml-dataprep-rslex, training version:2.2.0, current version:2.1.0
Package:azureml-dataset-runtime, training version:1.38.0, current version:1.37.0
Package:azureml-defaults, training version:1.38.0, current version:1.37.0
Package:azureml-interpret, training version:1.38.0, current version:1.37.0
Package:azureml-mlflow, training version:1.38.0, current version:1.37.0
Package:azureml-pipeline-core, training version:1.38.0, current version:1.37.0
Package:azureml-responsibleai, training version:1.38.0, current version:1.37.0
Package:azureml-telemetry, training version:1.38.0, current version:1.37.0
Package:azureml-train-automl-client, training version:1.38.0, current version:1.37.0
Package:azureml-train-automl-runtime, training version:1.38.0, current version

AutoML50805a21620


Package:azureml-automl-runtime, training version:1.38.0, current version:1.37.0
Package:azureml-core, training version:1.38.0, current version:1.37.0
Package:azureml-dataprep, training version:2.26.0, current version:2.25.0
Package:azureml-dataprep-rslex, training version:2.2.0, current version:2.1.0
Package:azureml-dataset-runtime, training version:1.38.0, current version:1.37.0
Package:azureml-defaults, training version:1.38.0, current version:1.37.0
Package:azureml-interpret, training version:1.38.0, current version:1.37.0
Package:azureml-mlflow, training version:1.38.0, current version:1.37.0
Package:azureml-pipeline-core, training version:1.38.0, current version:1.37.0
Package:azureml-responsibleai, training version:1.38.0, current version:1.37.0
Package:azureml-telemetry, training version:1.38.0, current version:1.37.0
Package:azureml-train-automl-client, training version:1.38.0, current version:1.37.0
Package:azureml-train-automl-runtime, training version:1.38.0, current version

AutoML571c244670


Package:azureml-automl-runtime, training version:1.38.0, current version:1.37.0
Package:azureml-core, training version:1.38.0, current version:1.37.0
Package:azureml-dataprep, training version:2.26.0, current version:2.25.0
Package:azureml-dataprep-rslex, training version:2.2.0, current version:2.1.0
Package:azureml-dataset-runtime, training version:1.38.0, current version:1.37.0
Package:azureml-defaults, training version:1.38.0, current version:1.37.0
Package:azureml-interpret, training version:1.38.0, current version:1.37.0
Package:azureml-mlflow, training version:1.38.0, current version:1.37.0
Package:azureml-pipeline-core, training version:1.38.0, current version:1.37.0
Package:azureml-responsibleai, training version:1.38.0, current version:1.37.0
Package:azureml-telemetry, training version:1.38.0, current version:1.37.0
Package:azureml-train-automl-client, training version:1.38.0, current version:1.37.0
Package:azureml-train-automl-runtime, training version:1.38.0, current version

AutoMLfc2aee46d0


Package:azureml-automl-runtime, training version:1.38.0, current version:1.37.0
Package:azureml-core, training version:1.38.0, current version:1.37.0
Package:azureml-dataprep, training version:2.26.0, current version:2.25.0
Package:azureml-dataprep-rslex, training version:2.2.0, current version:2.1.0
Package:azureml-dataset-runtime, training version:1.38.0, current version:1.37.0
Package:azureml-defaults, training version:1.38.0, current version:1.37.0
Package:azureml-interpret, training version:1.38.0, current version:1.37.0
Package:azureml-mlflow, training version:1.38.0, current version:1.37.0
Package:azureml-pipeline-core, training version:1.38.0, current version:1.37.0
Package:azureml-responsibleai, training version:1.38.0, current version:1.37.0
Package:azureml-telemetry, training version:1.38.0, current version:1.37.0
Package:azureml-train-automl-client, training version:1.38.0, current version:1.37.0
Package:azureml-train-automl-runtime, training version:1.38.0, current version

AutoMLa01eb23b820


In [35]:
all_models['San-Diego']

ForecastingPipelineWrapper(pipeline=Pipeline(memory=None,
                                             steps=[('timeseriestransformer',
                                                     TimeSeriesTransformer(country_or_region=None, drop_column_names=[], featurization_config=FeaturizationConfig(
    blocked_transformers=None,
    column_purposes=None,
    transformer_params=None,
    dataset_language=None,
    drop_columns=None,
    prediction_transform_type=None
), force_time_...
    timeseries_param_dict={'time_column_name': 'date', 'grain_column_names': None, 'drop_column_names': [], 'overwrite_columns': True, 'dropna': False, 'transform_dictionary': {'min': '_automl_target_col', 'max': '_automl_target_col', 'mean': '_automl_target_col'}, 'max_horizon': 1, 'origin_time_colname': 'origin', 'country_or_region': None, 'n_cross_validations': 3, 'short_series_handling': True, 'max_cores_per_iteration': 1, 'feature_lags': None, 'target_aggregation_function': None, 'cv_step_size': None, 

#### Forecast the test period with the retrieved models

The test_df also contains the y_variable which needs to be dropped

In [36]:
# Forecast window: one row per calendar day from 2022-10-01 through 2022-12-31
# (both ends inclusive), in a single 'date' column with a default 0..n-1 index.
X_test_df = pd.date_range(start='2022-10-01', end='2022-12-31').to_frame(index=False, name='date')
X_test_df

Unnamed: 0,date
0,2022-10-01
1,2022-10-02
2,2022-10-03
3,2022-10-04
4,2022-10-05
...,...
87,2022-12-27
88,2022-12-28
89,2022-12-29
90,2022-12-30


In [37]:
# Get predictions from all models
# NOTE(review): the model shown above reports 'max_horizon': 1 while X_test_df
# spans 92 days — confirm that forecasting this far ahead is intended.
all_predictions = {}
for city, fitted_model in all_models.items():
    # As the displayed output shows, forecast() returns a tuple here:
    # index 0 is the array of predicted values, index 1 a frame of generated features.
    predictions = fitted_model.forecast(X_test_df)
    display(predictions)
    # Store the whole tuple; the cells below read the values through predictions[0].
    all_predictions[city] = predictions

(array([41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
        41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
        41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
        41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
        41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
        41, 41, 41, 41, 41, 41, 41]),
                                     _automl_target_col_WASNULL  _automl_year  \
 date       _automl_dummy_grain_col                                             
 2022-10-01 _automl_dummy_grain_col                           0          2022   
 2022-10-02 _automl_dummy_grain_col                           0          2022   
 2022-10-03 _automl_dummy_grain_col                           0          2022   
 2022-10-04 _automl_dummy_grain_col                           0          2022   
 2022-10-05 _automl_dummy_grain_col                           0          2022   
 ...           

(array([37.21243351, 37.20438507, 37.18691158, 37.18438464, 37.18846826,
        37.18079905, 37.18019141, 37.17828863, 37.17951547, 37.20331594,
        37.18215957, 37.18104146, 37.18685528, 37.1871063 , 37.17635915,
        37.17929385, 37.19778024, 37.20959672, 37.20865564, 37.21542482,
        37.20907496, 37.20144121, 37.20596953, 37.20045796, 37.16351065,
        37.16112455, 37.16158771, 37.16401045, 37.14663877, 37.07758528,
        37.08727184, 37.17657908, 37.17619275, 37.17302072, 37.18728181,
        37.17915299, 37.17929665, 37.18743428, 37.17388852, 37.17307048,
        37.16876513, 37.17129277, 37.16331909, 37.1632331 , 37.17513528,
        37.16704308, 37.15802033, 37.16360322, 37.19132885, 37.19014941,
        37.18984279, 37.19159629, 37.18900887, 37.17823153, 37.183097  ,
        37.15927025, 37.15267783, 37.14235727, 37.18476797, 37.16335596,
        37.14425553, 37.1898963 , 37.18541854, 37.14247659, 37.14157332,
        37.14607595, 37.15989121, 37.14760061, 37.1

(array([42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
        42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
        42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
        42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
        42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
        42, 42, 42, 42, 42, 42, 42]),
                                     _automl_target_col_WASNULL  _automl_year  \
 date       _automl_dummy_grain_col                                             
 2022-10-01 _automl_dummy_grain_col                           0          2022   
 2022-10-02 _automl_dummy_grain_col                           0          2022   
 2022-10-03 _automl_dummy_grain_col                           0          2022   
 2022-10-04 _automl_dummy_grain_col                           0          2022   
 2022-10-05 _automl_dummy_grain_col                           0          2022   
 ...           

(array([40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
        40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
        40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
        40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
        40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
        40, 40, 40, 40, 40, 40, 40]),
                                     _automl_target_col_WASNULL  _automl_year  \
 date       _automl_dummy_grain_col                                             
 2022-10-01 _automl_dummy_grain_col                           0          2022   
 2022-10-02 _automl_dummy_grain_col                           0          2022   
 2022-10-03 _automl_dummy_grain_col                           0          2022   
 2022-10-04 _automl_dummy_grain_col                           0          2022   
 2022-10-05 _automl_dummy_grain_col                           0          2022   
 ...           

(array([38.51588386, 38.3788362 , 38.16522509, 38.1455128 , 38.23232662,
        38.36128278, 38.30371857, 38.40347848, 38.374551  , 38.40294762,
        38.42202821, 38.27874494, 38.21611856, 38.36983759, 38.42017479,
        38.38992435, 38.28686845, 38.42423422, 38.35195025, 38.21118913,
        38.37407265, 38.38883065, 38.34666138, 38.20780161, 38.4757291 ,
        38.36663536, 38.22801175, 38.89500314, 39.12569802, 38.44117047,
        39.79910497, 39.09257291, 38.98684354, 39.15890349, 39.48182795,
        39.53959502, 39.3948216 , 39.31011587, 39.31089962, 39.29026208,
        39.33730685, 39.39883688, 39.34477177, 39.28678933, 39.31992435,
        39.22200471, 39.1223761 , 39.03965161, 39.03406774, 39.08923678,
        39.10519475, 39.36024577, 39.26283724, 39.14243409, 39.0892699 ,
        38.8790025 , 38.99394575, 39.07280292, 38.84680989, 40.60815272,
        40.17317991, 38.5944994 , 38.59034761, 38.61776217, 38.61494025,
        38.75498237, 38.81586434, 38.76249411, 38.6

In [38]:
predictions[0]

array([38.51588386, 38.3788362 , 38.16522509, 38.1455128 , 38.23232662,
       38.36128278, 38.30371857, 38.40347848, 38.374551  , 38.40294762,
       38.42202821, 38.27874494, 38.21611856, 38.36983759, 38.42017479,
       38.38992435, 38.28686845, 38.42423422, 38.35195025, 38.21118913,
       38.37407265, 38.38883065, 38.34666138, 38.20780161, 38.4757291 ,
       38.36663536, 38.22801175, 38.89500314, 39.12569802, 38.44117047,
       39.79910497, 39.09257291, 38.98684354, 39.15890349, 39.48182795,
       39.53959502, 39.3948216 , 39.31011587, 39.31089962, 39.29026208,
       39.33730685, 39.39883688, 39.34477177, 39.28678933, 39.31992435,
       39.22200471, 39.1223761 , 39.03965161, 39.03406774, 39.08923678,
       39.10519475, 39.36024577, 39.26283724, 39.14243409, 39.0892699 ,
       38.8790025 , 38.99394575, 39.07280292, 38.84680989, 40.60815272,
       40.17317991, 38.5944994 , 38.59034761, 38.61776217, 38.61494025,
       38.75498237, 38.81586434, 38.76249411, 38.64621086, 38.70

In [39]:
# Pair each forecast date with its predicted wait time, one frame per city.
predicted_dfs = {}

for city, predictions in all_predictions.items():
    # predictions[0] is the array of forecast values for this city.
    # NOTE(review): the division by 5.0 rescales the model output; the reason
    # for this factor is not visible in the notebook — confirm before reuse.
    df = X_test_df.assign(wait_time=predictions[0] / 5.0)
    display(df)
    predicted_dfs[city] = df

Unnamed: 0,date,wait_time
0,2022-10-01,8.20
1,2022-10-02,8.20
2,2022-10-03,8.20
3,2022-10-04,8.20
4,2022-10-05,8.20
...,...,...
87,2022-12-27,8.20
88,2022-12-28,8.20
89,2022-12-29,8.20
90,2022-12-30,8.20


Unnamed: 0,date,wait_time
0,2022-10-01,7.44
1,2022-10-02,7.44
2,2022-10-03,7.44
3,2022-10-04,7.44
4,2022-10-05,7.44
...,...,...
87,2022-12-27,7.42
88,2022-12-28,7.42
89,2022-12-29,7.41
90,2022-12-30,7.41


Unnamed: 0,date,wait_time
0,2022-10-01,8.40
1,2022-10-02,8.40
2,2022-10-03,8.40
3,2022-10-04,8.40
4,2022-10-05,8.40
...,...,...
87,2022-12-27,8.40
88,2022-12-28,8.40
89,2022-12-29,8.40
90,2022-12-30,8.40


Unnamed: 0,date,wait_time
0,2022-10-01,8.00
1,2022-10-02,8.00
2,2022-10-03,8.00
3,2022-10-04,8.00
4,2022-10-05,8.00
...,...,...
87,2022-12-27,8.00
88,2022-12-28,8.00
89,2022-12-29,8.00
90,2022-12-30,8.00


Unnamed: 0,date,wait_time
0,2022-10-01,7.70
1,2022-10-02,7.68
2,2022-10-03,7.63
3,2022-10-04,7.63
4,2022-10-05,7.65
...,...,...
87,2022-12-27,7.82
88,2022-12-28,7.87
89,2022-12-29,8.10
90,2022-12-30,8.16


#### Upload predictions to storage account

In [40]:
# For every city, stack the training history on top of the forecast rows and
# tag each row with the (hyphenated) city name.
final_dfs = []

for city, predicted_df in predicted_dfs.items():
    print(city)
    # Row labels are kept as they are, so history and forecast labels may repeat.
    final_df = pd.concat([all_train_dfs[city], predicted_df]).assign(city=city)
    display(final_df)
    final_dfs.append(final_df)

New-York-City


Unnamed: 0,date,wait_time,city
1808,2017-12-16,8.60,New-York-City
1809,2017-12-17,8.42,New-York-City
1810,2017-12-18,8.09,New-York-City
1811,2017-12-19,8.21,New-York-City
1812,2017-12-20,8.10,New-York-City
...,...,...,...
87,2022-12-27,8.20,New-York-City
88,2022-12-28,8.20,New-York-City
89,2022-12-29,8.20,New-York-City
90,2022-12-30,8.20,New-York-City


London


Unnamed: 0,date,wait_time,city
0,2017-12-16,7.80,London
1,2017-12-17,8.50,London
2,2017-12-18,8.20,London
3,2017-12-19,8.07,London
4,2017-12-20,7.76,London
...,...,...,...
87,2022-12-27,7.42,London
88,2022-12-28,7.42,London
89,2022-12-29,7.41,London
90,2022-12-30,7.41,London


Sydney


Unnamed: 0,date,wait_time,city
5424,2017-12-16,7.20,Sydney
5425,2017-12-17,7.85,Sydney
5426,2017-12-18,9.00,Sydney
5427,2017-12-19,7.55,Sydney
5428,2017-12-20,8.13,Sydney
...,...,...,...
87,2022-12-27,8.40,Sydney
88,2022-12-28,8.40,Sydney
89,2022-12-29,8.40,Sydney
90,2022-12-30,8.40,Sydney


San-Diego


Unnamed: 0,date,wait_time,city
3616,2017-12-16,9.07,San-Diego
3617,2017-12-17,8.27,San-Diego
3618,2017-12-18,8.55,San-Diego
3619,2017-12-19,8.41,San-Diego
3620,2017-12-20,8.52,San-Diego
...,...,...,...
87,2022-12-27,8.00,San-Diego
88,2022-12-28,8.00,San-Diego
89,2022-12-29,8.00,San-Diego
90,2022-12-30,8.00,San-Diego


Tokyo


Unnamed: 0,date,wait_time,city
7232,2017-12-17,7.40,Tokyo
7233,2017-12-18,9.30,Tokyo
7234,2017-12-19,8.33,Tokyo
7235,2017-12-20,7.80,Tokyo
7236,2017-12-21,8.60,Tokyo
...,...,...,...
87,2022-12-27,7.82,Tokyo
88,2022-12-28,7.87,Tokyo
89,2022-12-29,8.10,Tokyo
90,2022-12-30,8.16,Tokyo


In [41]:
import os

# Combine all cities into a single frame (columns: date, wait_time, city).
full_df = pd.concat(final_dfs)
print(full_df.shape)

# The notebook never creates the 'data' folder, so create it here; otherwise
# to_csv fails on a fresh checkout because the target directory is missing.
output_path = 'data/wait_time_forecasted.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
full_df.to_csv(output_path, index=False)

(9204, 3)


In [44]:
from azure.storage.blob import BlobClient

# Target blob for the combined history + forecast file.
blob = BlobClient.from_connection_string(
    conn_str=gv.STORAGE_ACCOUNT_CONNECTION_STRING,
    container_name=gv.CONTAINER_NAME,
    blob_name="wait_time_forecastedv2.csv",
)

# overwrite=True lets the notebook be re-run: without it upload_blob raises
# once the blob already exists. The handle is named forecast_file so it does
# not replace the `data` DataFrame created when the source data was loaded.
with open('data/wait_time_forecasted.csv', "rb") as forecast_file:
    blob.upload_blob(forecast_file, overwrite=True)