# Walgreens Boots Alliance Demo

## Set up your environment

Before running any cells, make sure you have:
* Created your conda environment as per the README.md.
* Selected your new conda environment to run this notebook (from the Kernel menu).
* Updated the values as indicated in the cell below.

In [None]:
# Notebook-wide settings.
#
# Azure workspace coordinates: replace each placeholder before running. They are
# used to construct the Workspace when no saved workspace config can be loaded.
subscription_id = '<insert your Azure subscription ID here>'
resource_group = '<insert the name of the resource group that holds your AML workspace here>'
workspace_name = '<insert your AML workspace name here>'

# Names of the experiment, the compute cluster, and the local folder that the
# generated scripts are written to.
experiment_name = 'category-based-propensity'
cluster_name = 'wba-cluster'
project_folder = 'scripts'

In [None]:
# Key open source data analysis packages
import json
import logging
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

%matplotlib inline
# Apply seaborn's default theme with colour-code remapping enabled.
# Pass a real boolean: the string 'True' only worked because any non-empty
# string is truthy (the string 'False' would have enabled it as well).
sns.set(color_codes=True)

In [None]:
# Working directories
# exist_ok=True makes this a no-op when the folder is already there, so no
# separate existence check (and no check-then-create race) is needed. If the
# path exists but is a regular file, this now raises instead of silently
# continuing to a later failure when the scripts are written.
os.makedirs(project_folder, exist_ok=True)

## Exploring our dataset

Let's take a look at our dataset before building our models.

In [None]:
# Load the local CSV into a DataFrame; the exploration cells below all read from `df`
df = pd.read_csv('./data.csv')

In [None]:
# Show the first ten rows to see which columns are available
df.head(n=10)

In [None]:
# Plot the age distribution using decade-wide bins with edges at 10, 20, ..., 100
sns.distplot(df[['AGE']], bins=list(range(10, 101, 10)))

In [None]:
# Plot the distribution of the CATEGORY_1_SPEND column
sns.distplot(df['CATEGORY_1_SPEND'])

In [None]:
# Compare age distributions side by side: one panel per value of
# BOUGHT_CATEGORY_1, each showing the AGE distribution for that group
g = sns.FacetGrid(data=df, col='BOUGHT_CATEGORY_1')
g.map(sns.distplot, 'AGE')

In [None]:
# Compare gender counts side by side: one panel per value of
# BOUGHT_CATEGORY_1, each showing a count plot of GENDER for that group
g = sns.FacetGrid(data=df, col='BOUGHT_CATEGORY_1')
g.map(sns.countplot, 'GENDER')

In [None]:
# Pairwise relationships between age and category #1 / #2 spend, coloured by
# whether the customer bought in category #1
sns.pairplot(
    df[['AGE', 'CATEGORY_1_SPEND', 'CATEGORY_2_SPEND', 'BOUGHT_CATEGORY_1']],
    hue='BOUGHT_CATEGORY_1',
)

## Set up Azure Machine Learning

Let's connect, provision our compute, and upload our data.

In [None]:
# Setup Azure Machine Learning
from azureml.core import Run
from azureml.core.compute import AksCompute, ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.container_registry import ContainerRegistry
from azureml.core.experiment import Experiment
from azureml.core.runconfig import DataReferenceConfiguration, RunConfiguration
from azureml.core.webservice import AciWebservice
from azureml.core.workspace import Workspace
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import Pipeline, PipelineData, PublishedPipeline, PipelineRun, Schedule, TrainingOutput
from azureml.pipeline.core.graph import PipelineParameter
from azureml.pipeline.steps import PythonScriptStep
from azureml.train.automl import AutoMLConfig, AutoMLStep
from azureml.train.automl.automlexplainer import retrieve_model_explanation
from azureml.train.automl.run import AutoMLRun
from azureml.widgets import RunDetails

import azureml

# Connect to Azure Machine Learning.
# Prefer a previously saved workspace config. If loading it fails, fall back to
# the constants defined at the top of the notebook and save a config file so
# the next run can use it.
try:
    ws = Workspace.from_config()
except Exception:  # not a bare `except:` so KeyboardInterrupt/SystemExit still propagate
    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)
    ws.write_config()

    print('Workspace config file written')

# Summarise the connection; the transposed one-column frame is the cell output.
output = {
    'SDK version': azureml.core.VERSION,
    'Subscription ID': ws.subscription_id,
    'Workspace': ws.name,
    'Resource Group': ws.resource_group,
    'Location': ws.location,
}
# NOTE(review): -1 disables column truncation on older pandas; pandas >= 1.0
# deprecates it in favour of None. Confirm the pinned pandas version before changing.
pd.set_option('display.max_colwidth', -1)
pd.DataFrame(data=output, index=['']).T

In [None]:
# Attach to the named compute cluster, creating it if the lookup fails
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # No usable target under that name: provision an AmlCompute cluster
    # sized between 0 and 4 nodes and wait until it is ready.
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS12_V2',
        min_nodes=0,
        max_nodes=4,
    )

    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    compute_target.wait_for_completion(show_output=True)
else:
    print('Found existing compute target')

# Display the target's current status as the cell output
compute_target.status.serialize()

In [None]:
# Copy the local CSV into the 'boots' folder of the workspace's default
# datastore, overwriting any copy that is already there
ds = ws.get_default_datastore()
ds.upload_files(files=['./data.csv'],
                target_path='boots',
                overwrite=True)

## Experiment with Automated ML

Let's submit a training run using our data and Automated ML.

In [None]:
%%writefile $project_folder/get_data.py

import pandas as pd
from sklearn.preprocessing import LabelEncoder

def get_data():
    """Load the training CSV and split it into features and encoded labels.

    The file is read from the fixed path /tmp/azureml_runs/boots/data.csv.

    Returns:
        dict: "X" maps to a DataFrame holding every column except
        BOUGHT_CATEGORY_1; "y" maps to the BOUGHT_CATEGORY_1 values after
        encoding with a LabelEncoder fitted on that same column.
    """
    target_column = 'BOUGHT_CATEGORY_1'
    frame = pd.read_csv('/tmp/azureml_runs/boots/data.csv')

    # Fit the encoder on the target values, then encode those same values.
    labels = frame[target_column].values
    encoder = LabelEncoder()
    encoder.fit(labels)
    y = encoder.transform(labels)

    # Everything except the target column is a feature.
    features = frame.drop([target_column], axis='columns')

    return {"X": features, "y": y}

In [None]:
# Experiment handle under which the training run below is submitted
experiment = Experiment(workspace=ws, name=experiment_name)

In [None]:
# Data source reference: download the datastore's 'boots' folder onto the
# compute under /tmp/azureml_runs (get_data.py reads the CSV from there).
dr = DataReferenceConfiguration(
    datastore_name=ds.name,
    path_on_compute='/tmp/azureml_runs',
    path_on_datastore='boots',
    mode='download',
    overwrite=False,
)

# Run configuration: a Docker-based Python environment on our compute target,
# using the default CPU base image plus the packages the training run needs.
run_config = RunConfiguration(framework="python")
run_config.target = compute_target
run_config.data_references = {ds.name: dr}
run_config.environment.docker.enabled = True
run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE
run_config.environment.python.conda_dependencies = CondaDependencies.create(
    pip_packages=['azureml-sdk[automl]', 'azureml-explain-model'],
    conda_packages=['numpy', 'py-xgboost<=0.80'],
)

# Automated ML settings: a classification task scored on accuracy, with the
# training data supplied by the get_data.py script in the project folder.
automl_config = AutoMLConfig(
    task='classification',
    iterations=25,
    iteration_timeout_minutes=5,
    max_cores_per_iteration=2,
    max_concurrent_iterations=4,
    primary_metric='accuracy',
    data_script=project_folder + '/get_data.py',
    run_configuration=run_config,
    path=project_folder,
    n_cross_validations=2,
    preprocess=True,
)

In [None]:
# Submit the Automated ML configuration without streaming its output here;
# the run object is echoed as the cell output
remote_run = experiment.submit(config=automl_config, show_output=False)
remote_run

In [None]:
# Wait for the submitted run to complete, showing its output in the notebook
remote_run.wait_for_completion(show_output=True)

## Review our results

Once the experiment completes, let's review the results.

In [None]:
# The run details record the configuration and the exact Git commit used for the run.
# Load the run's properties into a one-row frame and show the Git fields as a column.
run_properties = remote_run.get_details()['properties']
remote_run_df = pd.read_json('[' + json.dumps(run_properties) + ']', orient='columns')
remote_run_df[['azureml.git.branch', 'azureml.git.commit', 'azureml.git.repository_uri']].T

In [None]:
# Explore the results interactively: show the RunDetails widget for the remote run
RunDetails(remote_run).show()

In [None]:
# Pick the best run, and the model it fitted, according to the chosen metric
# (change lookup_metric to rank the runs by a different metric)
lookup_metric = 'accuracy'
best_run, fitted_model = remote_run.get_output(metric=lookup_metric)

print(best_run)
print(fitted_model)

## Publish our model

Once we've selected our preferred model, we can register it for management (and optional deployment).

In [None]:
# Register the best run's model file so the team can find and reuse it
model = best_run.register_model(
    model_name='category_1_model.pkl',
    model_path='outputs/model.pkl',
    tags={
        'area': 'CATEGORY 1',
        'type': 'classification',
    },
)
print(model.name, model.version)

In [None]:
# Show the registered model's serialized details (captures training details)
model.serialize()

## Build a pipeline

We can build an AML pipeline to make our experiment easy to re-run as data changes.

In [None]:
%%writefile $project_folder/register.py

from azureml.core import Workspace
from azureml.core.model import Model
from azureml.core import Run
import argparse
import json

if __name__ == '__main__':
    # Command-line interface: the name to register the model under and the
    # location of the trained model to register.
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='',
                        help='Variant name you want to give to the model.')
    parser.add_argument('--model_path',
                        type=str,
                        default='outputs',
                        help='Location of trained model.')

    # parse_known_args ignores any extra arguments instead of failing on them.
    args, _ = parser.parse_known_args()
    print(args.model_name)
    print(args.model_path)

    # Resolve the workspace from the run this script is executing in.
    run = Run.get_context()
    ws = run.experiment.workspace

    # Tag the registered model with the id of the run that registered it.
    tags = {"runId": str(run.id)}
    print(json.dumps(tags))

    model = Model.register(ws,
                           model_name=args.model_name,
                           model_path=args.model_path,
                           tags=tags)

    print('Model registered: {} \nModel Description: {} \nModel Version: {}'.format(model.name, model.description, model.version))

In [None]:
# Pipeline input: the same 'boots' datastore folder used by the experiment,
# downloaded onto the compute under /tmp/azureml_runs
input_data = DataReference(
    datastore=ds,
    data_reference_name='training_data',
    path_on_datastore='boots',
    mode='download',
    path_on_compute='/tmp/azureml_runs',
    overwrite=True,
)

# Fresh run configuration for the pipeline: Docker-based Python environment on
# the default CPU base image with the Automated ML dependencies
run_config = RunConfiguration(framework="python")
run_config.environment.docker.enabled = True
run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE
run_config.environment.python.conda_dependencies = CondaDependencies.create(
    pip_packages=['azureml-sdk[automl]'],
    conda_packages=['numpy', 'py-xgboost<=0.80'],
)

In [None]:
# Build a pipeline

# The two outputs from AutoML: one PipelineData for the 'Metrics' training
# output and one for the 'Model' training output.
metrics_data = PipelineData(name='metrics_data_category_1',
                            datastore=ds,
                            pipeline_output_name='metrics_output_category_1',
                            training_output=TrainingOutput(type='Metrics'))

# The name is a plain literal. The previous `.format(cat)` call referenced an
# undefined variable and raised NameError before the pipeline could be built.
model_data = PipelineData(name='model_data_category_1',
                          datastore=ds,
                          pipeline_output_name='best_model_output_category_1',
                          training_output=TrainingOutput(type='Model'))

# AutoML config, using the same settings as the interactive experiment.
# data_script points at the get_data.py written into project_folder; the
# previous value '{}/get_data.py' was an unformatted template, not a real path.
automl_config = AutoMLConfig(task='classification',
                             iterations=25,
                             iteration_timeout_minutes=5,
                             max_cores_per_iteration=2,
                             max_concurrent_iterations=4,
                             primary_metric='accuracy',
                             data_script=project_folder + '/get_data.py',
                             run_configuration=run_config,
                             compute_target=compute_target,
                             path=project_folder,
                             n_cross_validations=2,
                             preprocess=True)

# AutoML action
automl_step = AutoMLStep(name='automl_module_category_1',
                         automl_config=automl_config,
                         inputs=[input_data],
                         outputs=[metrics_data, model_data],
                         allow_reuse=False)

# Custom script action to register the model afterwards; it consumes the
# model output of the AutoML step and passes its location to register.py.
register_step = PythonScriptStep(name='register_category_1',
                                 script_name='register.py',
                                 compute_target=compute_target,
                                 source_directory=project_folder,
                                 arguments=['--model_name', 'category_1_model.pkl', '--model_path', model_data],
                                 inputs=[model_data],
                                 allow_reuse=False)

# Collect the steps once and hand the list to the pipeline.
steps = [automl_step, register_step]

pipeline = Pipeline(description='Generate recommendation models',
                    workspace=ws,
                    steps=steps)

pipeline.validate()

# Once published, we can invoke on demand via the SDK or via a REST endpoint
published_pipeline = pipeline.publish(name='category-based-propensity-pipeline')

## Schedule our pipeline

Now that our experiment is available as a pipeline, we can schedule it or run it on demand.

In [None]:
# Create a schedule for the published pipeline that watches the 'boots' folder
# of our datastore, so the pipeline runs automatically when the data changes
schedule = Schedule.create(
    workspace=ws,
    name='category-based-propensity-schedule',
    description='Scheduled run of category-based-propensity',
    pipeline_id=published_pipeline.id,
    experiment_name='category-based-propensity-schedule',
    datastore=ds,
    path_on_datastore='boots',
    polling_interval=1,
    wait_for_provisioning=True,
)

In [None]:
# Alternatively, trigger the published pipeline on demand, using the pipeline's
# own name as the experiment name
published_pipeline.submit(workspace=ws, experiment_name=published_pipeline.name)