# Imports

In [None]:
from adal import AuthenticationContext
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.core.experiment import Experiment
from azureml.pipeline.core import Pipeline, PipelineData, PublishedPipeline, StepSequence
from azureml.pipeline.core.schedule import Schedule
from azureml.pipeline.steps import PythonScriptStep, ParallelRunStep, ParallelRunConfig
import os
import requests
from utils import *

# Workspace Parameters

In [None]:
# Workspace parameters. Every value below is supplied by the user.

# Retrieved from Storage Account -> Access Keys -> Show Keys -> key1 (Key).
# NOTE: this is a secret; avoid committing a filled-in value to source control.
account_key = ""
blob_datastore_name = ""  # Name of the Datastore to be registered
CONTAINER_NAME = ""  # Blob container name in the desired Storage Account
dataset_type = ""  # Either 'Tabular' or 'File'

# Local path to store the config file if create_ws is True in setup_workspace().
# os.path.join puts a path separator between the working directory and the
# folder name; plain string concatenation would fuse them into one name.
local_path = os.path.join(os.getcwd(), "your_desired_local_path")

LOCATION = ""  # Desired location of the Resource Group and AzureML workspace
RESOURCE_GROUP_NAME = ""  # The Resource Group name

# STORAGE_ACCOUNT_NAME:
# - Required when using register_datastore().
# - When using setup_workspace() to create the AzureML workspace, you can
#   associate your own Storage Account with it by setting use_my_storage = True
#   and create_ws = True. You may then need to add the 'Contributor' and
#   'Storage Blob Data Contributor' roles for the AzureML workspace under the
#   Storage Account's Access Control.
# - If use_my_storage is not set to True, the AzureML workspace creates a
#   default Storage Account.
# - If you only retrieve an existing Workspace object, it is not necessary.
STORAGE_ACCOUNT_NAME = ""

subscription_id = ""  # Azure subscription ID
ws_name = ""  # AzureML workspace name

# Pipeline Parameters

In [None]:
# Pipeline parameters. Values are supplied by the user unless noted otherwise.

# Blob paths of the input files, including any subdirectories and the filename.
BLOBNAME_train = "Training_Data.csv"
BLOBNAME_test = "Test_Data.csv"

# ComputeTarget: its name, and its type, which is either a Compute Instance
# ('instance') or a Compute Cluster ('cluster').
compute_name = "computeMarshall"
compute_type = "cluster"

# Encoding of the TabularDataset object (e.g. utf-8, iso88591).
# Supported encodings are 'utf8', 'iso88591', 'latin1', 'ascii', 'utf16',
# 'utf32', 'utf8bom' and 'windows1252'.
encoding = 'iso88591'

# Name of the Environment to be used in the pipeline runs.
env_name = "TestMarshallEnv"

features = "Features"
model_name = "Model"
pipeline_name = "train-pipeline-test"

# pip packages installed into the custom Environment.
pip_packages = [
    'pandas',
    'scikit-learn',
    'azureml-sdk',
    'nltk',
    'xgboost',
    'azureml-dataset-runtime[fuse,pandas]',
]

# File names passed to the pipeline steps via --output_name / --input_filename.
prep_inference_data = 'test_dtm.csv'
prep_training_data = 'train_dtm.csv'

# Names under which the Datasets are registered.
train_dataset_name = 'train_ds'
test_dataset_name = 'test_ds'

# Size of the virtual machine.
vm_size = 'STANDARD_D1_V2'

# Steps

## Setup ML workspace

In [None]:
# Obtain the workspace handle from the workspace parameters defined above.
# Per the parameter notes in this notebook, setup_workspace() (from utils) can
# either create the workspace or just retrieve it — see utils for the options.
ws = setup_workspace(
    local_path,
    LOCATION,
    RESOURCE_GROUP_NAME,
    STORAGE_ACCOUNT_NAME,
    subscription_id,
    ws_name,
)
ws  # last expression of the cell, so the notebook displays it

## Register Datastore (optional)

In [None]:
# Optional: register the blob container as a Datastore in the workspace.
# Uses the storage account key, so account_key must be filled in first.
register_datastore(
    account_key,
    blob_datastore_name,
    CONTAINER_NAME,
    STORAGE_ACCOUNT_NAME,
    ws,
)

## Register Datasets (optional)

In [None]:
# Optional: register the train and test blobs as Datasets under
# train_dataset_name / test_dataset_name.
register_dataset(
    blob_datastore_name,
    BLOBNAME_train,
    BLOBNAME_test,
    dataset_type,
    encoding,
    test_dataset_name,
    train_dataset_name,
    ws,
)

## Retrieve datasets

In [None]:
# Fetch the two registered Datasets from the workspace.
# NOTE(review): the test name is passed first while the train dataset is
# unpacked first — confirm this matches retrieve_dataset() in utils.
train_ds, test_ds = retrieve_dataset(
    test_dataset_name,
    train_dataset_name,
    ws,
)

## Prepare Compute Target and Pipeline configuration

In [None]:
# Prepare the objects shared by every pipeline step: the compute target, the
# run configuration and the environment (built with the pip packages above).
compute_target, run_config, environment = prepare_pipeline(
    compute_name,
    compute_type,
    env_name,
    vm_size,
    ws,
    pip_packages=pip_packages,
)

## Training Pipeline

#### prep_data is output of the 1st step and input to the 2nd step

In [None]:
# Intermediate data location on the workspace's default datastore; it is the
# output of the preprocessing step and the input of the training step.
prep_data = PipelineData(
    'prep_train_data',
    datastore=ws.get_default_datastore(),
)

In [None]:
# Step 1 (training): run data_prep.py on the registered training dataset and
# write the result into the `prep_data` intermediate location.
dataprep_step = PythonScriptStep(
    name='01 Data Preprocessing',
    script_name="data_prep.py",
    source_directory=os.getcwd() + '/src/AzureML_NonSpark',
    arguments=[
        "--datafolder", prep_data,
        "--input", train_dataset_name,
        "--output_name", prep_training_data,
        "--process", "training",
    ],
    # The dataset is exposed to the script under its registered name.
    inputs=[train_ds.as_named_input(train_dataset_name)],
    outputs=[prep_data],
    compute_target=compute_target,
    runconfig=run_config,
    allow_reuse=False,
)

In [None]:
# Step 2 (training): run train.py on the preprocessed file found in `prep_data`.
train_step1 = PythonScriptStep(
    name='02 Training',
    script_name="train.py",
    source_directory=os.getcwd() + '/src/AzureML_NonSpark',
    arguments=[
        "--datafolder", prep_data,
        "--input_filename", prep_training_data,
    ],
    inputs=[prep_data],
    compute_target=compute_target,
    runconfig=run_config,
    allow_reuse=False,
)

In [None]:
# A second training step with the same script, arguments and display name as
# `train_step1`.
# NOTE(review): this looks like an accidental duplicate and is not referenced
# by the pipeline assembly in this notebook — confirm whether it is needed.
train_step2 = PythonScriptStep(
    name='02 Training',
    script_name="train.py",
    source_directory=os.getcwd() + '/src/AzureML_NonSpark',
    arguments=[
        "--datafolder", prep_data,
        "--input_filename", prep_training_data,
    ],
    inputs=[prep_data],
    compute_target=compute_target,
    runconfig=run_config,
    allow_reuse=False,
)

In [None]:
# Final step (training): run log.py with --process Training and the "outputs"
# folder as its output location.
log_step = PythonScriptStep(
    name='Logging Run Status',
    script_name="log.py",
    source_directory=os.getcwd() + '/src/AzureML_NonSpark',
    arguments=[
        "--process", "Training",
        "--outputfolder", "outputs",
    ],
    compute_target=compute_target,
    runconfig=run_config,
    allow_reuse=False,
)

In [None]:
# Assemble the training steps into a sequential pipeline, submit it as an
# experiment run and block until the run finishes.
experiment = Experiment(workspace=ws, name="train-pipeline-december-no-spark")

# The training step defined in this notebook is bound to `train_step1`; no
# `train_step` exists, so referencing that name raised a NameError.
step_sequence = StepSequence(steps=[dataprep_step, train_step1, log_step])
train_pipeline = Pipeline(workspace=ws, steps=step_sequence)

# continue_on_step_failure=True is passed so a failing step does not stop the
# submission of the remaining steps.
train_pipeline_run = experiment.submit(train_pipeline, continue_on_step_failure=True)
train_pipeline_run.wait_for_completion(show_output=True)

## Publish Pipeline

In [None]:
# Publish the training pipeline so it can be triggered later (e.g. via REST).
published_pipeline = train_pipeline.publish(
    name="Training - December",
    description="Model training pipeline",
    version="1.0",
    continue_on_step_failure=True,
)

In [None]:
# Show the REST endpoint of the pipeline currently bound to `published_pipeline`.
rest_endpoint = published_pipeline.endpoint
print(rest_endpoint)

## Get published pipeline and enable

In [None]:
# Look up an already published pipeline by its ID. The ID can be copied from
# Azure Machine Learning studio; the value below is specific to one workspace
# and must be replaced with your own.
pipeline_id = "ffc7afa7-10d6-4de2-87ef-a438c9b42c49"
p = PublishedPipeline.get(ws, id=pipeline_id)
# Uncomment one of the following to enable or disable the published pipeline:
# p.enable()
# p.disable()

In [None]:
# Display the REST endpoint of the published pipeline retrieved above.
p.endpoint

## Reactive schedule 

In [None]:
# Print the name and ID of every published pipeline in the workspace so the
# desired ID can be picked for scheduling.
published_pipelines = PublishedPipeline.list(ws)

# A dedicated loop variable is used and `pipeline_id` is deliberately NOT
# reassigned here. Otherwise this listing would silently overwrite
# `pipeline_id` / `published_pipeline` with the last listed entry, changing
# which pipeline the schedule created further down is attached to.
for listed_pipeline in published_pipelines:
    print(f"{listed_pipeline.name},'{listed_pipeline.id}'")

In [None]:
# Imported explicitly: `Datastore` is not among this notebook's named imports
# and would otherwise only be available if `from utils import *` leaks it.
from azureml.core import Datastore

# Inputs for the reactive schedule: the datastore to watch, the experiment the
# triggered runs are submitted under, and the path on the datastore to monitor.
datastore = Datastore(workspace=ws, name="test_datastore")
experiment_name = "train-pipeline-december-no-spark"
path_on_datastore = ""

In [None]:
# Create a schedule that triggers the published pipeline `pipeline_id` based on
# changes under `path_on_datastore` in `datastore`.
# NOTE(review): polling_interval is presumably in minutes — confirm in the SDK docs.
reactive_schedule = Schedule.create(
    ws,
    name="MyReactiveSchedule",
    description="Based on input file change.",
    pipeline_id=pipeline_id,
    experiment_name=experiment_name,
    datastore=datastore,
    path_on_datastore=path_on_datastore,
    polling_interval=1,
    continue_on_step_failure=True,
)

In [None]:
# Display the schedule object created above.
reactive_schedule

In [None]:
# Disable the schedule; this must happen before its published pipeline can be
# disabled (see the note below).
reactive_schedule.disable()

### Note: you cannot disable a published pipeline that has an active Schedule. You must first disable the schedule and then the pipeline.

#### List all schedules of the workspace and disable the first one

In [None]:
# Fetch the workspace's schedules and disable the first one in the list.
# Indexing raises IndexError when the workspace has no schedules.
workspace_schedules = Schedule.list(ws)
workspace_schedules[0].disable()

# Run a published pipeline through REST API

In [None]:
# Authenticate interactively and build the HTTP header used by the REST calls
# below.
interactive_auth = InteractiveLoginAuthentication()

auth_header = interactive_auth.get_authentication_header()

# Do not echo `auth_header` itself: it carries the access token, which would be
# stored in the notebook's cell output. Showing only the header names is enough
# to confirm that authentication succeeded.
print(f"Authentication header obtained (keys: {list(auth_header)})")

In [None]:
# Trigger the published pipeline through its REST endpoint. The JSON body names
# the experiment under which the run is submitted.
response = requests.post(
    p.endpoint,
    headers=auth_header,
    json={"ExperimentName": "train-pipeline-december-no-spark"},
)
print(response.json())

### Retrieve Pipeline run status through REST

In [None]:
# Identify the run whose status is queried: the experiment it belongs to and
# its run ID. Both are workspace-specific; replace them with your own values.
experiment_name = "train-pipeline-december-no-spark"
runId = "4021514c-a44f-4ce9-a50b-68bc5e4765a0"

In [None]:
# URL of the run-details endpoint, built from the workspace coordinates, the
# experiment name and the run ID. Adjacent f-string pieces are concatenated
# into a single string.
get_endpoint = (
    f'https://{LOCATION}.experiments.azureml.net/history/v1.0'
    f'/subscriptions/{subscription_id}'
    f'/resourceGroups/{RESOURCE_GROUP_NAME}'
    f'/providers/Microsoft.MachineLearningServices'
    f'/workspaces/{ws_name}'
    f'/experiments/{experiment_name}'
    f'/runs/{runId}/details'
)

In [None]:
# Query the run-details endpoint and show the run's status.
response_get = requests.get(
    get_endpoint,
    headers=auth_header,
)

# Fail with the HTTP error itself (e.g. 401/404) rather than an opaque
# KeyError: 'status' when the request was not successful.
response_get.raise_for_status()

response_get.json()['status']

## Inference Pipeline

#### prep_inf_data is output of the 1st step and input of the 2nd step

In [None]:
# Intermediate data location on the workspace's default datastore; it is the
# output of the inference preprocessing step and the input of the predict step.
prep_inf_data = PipelineData(
    'prep_inf_data',
    datastore=ws.get_default_datastore(),
)

In [None]:
# Step 1 (inference): run data_prep.py on the registered test dataset and write
# the result into the `prep_inf_data` intermediate location.
dataprep_step = PythonScriptStep(
    script_name="data_prep.py",
    name = '01 Data Preprocessing',
    arguments=["--datafolder", prep_inf_data,
               "--input", test_dataset_name,
               "--output_name", prep_inference_data,
               "--process", "inference"],
    # The dataset is exposed to the script under its registered name.
    inputs=[test_ds.as_named_input(test_dataset_name)],
    outputs=[prep_inf_data],
    compute_target=compute_target,
    runconfig=run_config,
    # Same script folder as the training steps in this notebook, which load
    # data_prep.py from 'src/AzureML_NonSpark'; the path previously lacked 'src'.
    source_directory=os.getcwd() + '/src/AzureML_NonSpark'
)

In [None]:
# Step 2 (inference): run inference.py with the named model and feature set on
# the preprocessed file found in `prep_inf_data`.
inference_step = PythonScriptStep(
    script_name="inference.py",
    name = '02 Predict',
    arguments=["--model_name", model_name,
               "--datafolder", prep_inf_data,
               "--features", features,
               "--input_filename", prep_inference_data],
    inputs=[prep_inf_data],
    compute_target=compute_target,
    runconfig=run_config,
    # Aligned with the script folder used by the training steps in this
    # notebook ('src/AzureML_NonSpark'); the path previously lacked 'src'.
    # NOTE(review): confirm inference.py lives in that folder too.
    source_directory=os.getcwd() + '/src/AzureML_NonSpark'
)

In [None]:
# Final step (inference): run log.py with --process Inference and the "outputs"
# folder as its output location.
log_step = PythonScriptStep(
    script_name="log.py",
    name = 'Logging Run Status',
    arguments=["--process", "Inference",
               "--outputfolder", "outputs"],
    allow_reuse=False,
    compute_target=compute_target,
    runconfig=run_config,
    # Same script folder as the training log step in this notebook, which loads
    # log.py from 'src/AzureML_NonSpark'; the path previously lacked 'src'.
    source_directory=os.getcwd() + '/src/AzureML_NonSpark'
)

In [None]:
# Assemble the inference steps into a sequential pipeline, submit it as an
# experiment run and block until the run finishes.
experiment = Experiment(name='inference-pipeline-test', workspace=ws)
step_sequence = StepSequence(
    steps=[dataprep_step, inference_step, log_step],
)
inference_pipeline = Pipeline(steps=step_sequence, workspace=ws)
inference_pipeline_run = experiment.submit(
    inference_pipeline,
    continue_on_step_failure=True,
)
inference_pipeline_run.wait_for_completion(show_output=True)

In [None]:
# Publish the inference pipeline. This rebinds `published_pipeline`, which
# earlier in the notebook referred to the published training pipeline.
published_pipeline = inference_pipeline.publish(
    name="Inference",
    description="Model inference pipeline",
    version="1.0",
    continue_on_step_failure=True,
)

In [None]:
# Show the REST endpoint of the pipeline currently bound to `published_pipeline`.
rest_endpoint = published_pipeline.endpoint
print(rest_endpoint)