# Importing Libraries

In [1]:
import logging
import os 
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig 
from azureml.core.dataset import Dataset

from azureml.pipeline.steps import AutoMLStep

#Check core SDK version number
print("SDK Version:", azureml.core.VERSION)


SDK Version: 1.37.0


## Initialize Workspace
Initialize a workspace object from persisted configuration. Make sure the config file is present at .\config.json

In [2]:
ws=Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep ='\n')

quick-starts-ws-182668
aml-quickstarts-182668
southcentralus
9a7511b8-150f-4a58-8528-3e7d50216c31


## Create an Azure ML experiment
Let's create an experiment named "automlstep-classification" and a folder to hold the training scripts. The script runs will be recorded under the experiment in Azure.

The best practice is to use separate folders for scripts and its dependent files for each step and specify that folder as the `source_directory` for the step. This helps reduce the size of the snapshot created for the step (only the specific folder is snapshotted). Since changes in any files in the `source_directory` would trigger a re-upload of the snapshot, this helps keep the reuse of the step when there are no changes in the `source_directory` of the step.

*Udacity Note:* There is no need to create an Azure ML experiment, this needs to re-use the experiment that was already created

In [3]:
# Choose a name for the run history container in the workspace
#NOTE: update these to match your existing experiment name. 

experiment_name = 'ml-bike-experiment-1'
project_folder='./pipelineAD'

experiment =Experiment(ws, experiment_name)
experiment


Name,Workspace,Report Page,Docs Page
ml-bike-experiment-1,quick-starts-ws-182668,Link to Azure Machine Learning studio,Link to Documentation


### Create or Attach an AmlCompute cluster
You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for your AutoML run. In this tutorial, you get the default `AmlCompute` as your training compute resource.

**Udacity Note** There is no need to create a new compute target, it can re-use the previous cluster

In [4]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException


# NOTE: update the cluster name to match the existing cluster
# Choose a name for your CPU cluster

amlcompute_cluster_name="auto-ml"
# Verify that cluster does not exist already

try: 
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config=AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',# for GPU, use "STANDARD_NC6"
                                                           #vm_priority = 'lowpriority', # optional
                                                        max_nodes=4)
    compute_target=ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True, min_node_count =1, timeout_in_minutes=20)

# For a more detailed view of current AmlCompute status, use get_status().

Found existing cluster, use it.
Succeeded.....................................................................................................................................................................................................................................
AmlCompute wait for completion finished

Wait timeout has been reached
Current provisioning state of AmlCompute is "Succeeded" and current node count is "0"


## Data

**Udacity note:** Make sure the `key` is the same name as the dataset that is uploaded, and that the description matches. If it is hard to find or unknown, loop over the `ws.datasets.keys()` and `print()` them.
If it *isn't* found because it was deleted, it can be recreated with the link that has the CSV 


In [5]:
# Try to load the dataset from the Workspace. Otherwise, create it from the file
# NOTE: update the key to match the dataset name

found=False
key = "Bikeshare-AD"
description_text= " AD dataset "


if key in ws.datasets.keys():
    found =True
    dataset = ws.datasets[key]

if not found:
    #Create AML dataset and register it into workspace
    example_data='https://raw.githubusercontent.com/Azure/MachineLearningNotebooks/master/how-to-use-azureml/automated-machine-learning/forecasting-bike-share/bike-no.csv'
    dataset= Dataset.Tabular.from_delimited_files(example_data)
    #Register Dataset in Workspace
    dataset=dataset.register (workspace=ws, 
                              name=key, 
                              description=description_text)

df=dataset.to_pandas_dataframe()
df.describe()

Unnamed: 0,instant,season,yr,mnth,weekday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0
mean,366.0,2.49658,0.500684,6.519836,2.997264,1.395349,0.495385,0.474354,0.627894,0.190486,848.176471,3656.172367,4504.348837
std,211.165812,1.110807,0.500342,3.451913,2.004787,0.544894,0.183051,0.162961,0.142429,0.077498,686.622488,1560.256377,1937.211452
min,1.0,1.0,0.0,1.0,0.0,1.0,0.05913,0.07907,0.0,0.022392,2.0,20.0,22.0
25%,183.5,2.0,0.0,4.0,1.0,1.0,0.337083,0.337842,0.52,0.13495,315.5,2497.0,3152.0
50%,366.0,3.0,1.0,7.0,3.0,1.0,0.498333,0.486733,0.626667,0.180975,713.0,3662.0,4548.0
75%,548.5,3.0,1.0,10.0,5.0,2.0,0.655417,0.608602,0.730209,0.233214,1096.0,4776.5,5956.0
max,731.0,4.0,1.0,12.0,6.0,3.0,0.861667,0.840896,0.9725,0.507463,3410.0,6946.0,8714.0


### Review the Dataset Result

You can peek the result of a TabularDataset at any range using `skip(i)` and `take(j).to_pandas_dataframe()`. Doing so evaluates only `j` records for all the steps in the TabularDataset, which makes it fast even against large datasets.

`TabularDataset` objects are composed of a list of transformation steps (optional).

In [6]:
dataset.take(5).to_pandas_dataframe()

Unnamed: 0,instant,date,season,yr,mnth,weekday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,6,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,2,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,3,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


## Train
This creates a general AutoML settings object.
**Udacity notes:** These inputs must match what was used when training in the portal. `time_column_name` has to be `cnt` for example

In [8]:
automl_settings={
    "experiment_timeout_minutes":20,
    "max_concurrent_iterations":5,
    " primacy_metric": 'normalized_root_mean_squared_error',
    "n_cross_validations":5
}
automl_config = AutoMLConfig(compute_target=compute_target,
                              task="forecasting",
                              training_data=dataset, 
                              time_column_name="date",
                              label_colume_name='cnt',
                              path = project_folder, 
                              enable_early_stopping = True, 
                              #featurization='auto',
                              debug_log ="automl_errors.log",
                              **automl_settings
                              )


#### Create Pipeline and AutoMLStep

You can define outputs for the AutoMLStep using TrainingOutput.

In [10]:

from azureml.pipeline.core import PipelineData, TrainingOutput  

ds=ws.get_default_datastore()
metrics_output_name = 'metrics_output'
best_model_output_name = 'best_model_output',

metrics_data = PipelineData(name='metrics_data',
                            datastore=ds,
                            pipeline_output_name= metrics_output_name,
                            training_output= TrainingOutput(type='Metrics'))

model_data= PipelineData(name='model_data',
                            datastore=ds, 
                            pipeline_output_name=best_model_output_name,
                            training_output=TrainingOutput(type='Model')) 
