In [81]:
from azureml.core import Workspace, Experiment, ScriptRunConfig, Environment, Dataset, Datastore, ComputeTarget, ScriptRunConfig
import os
import azureml.core
from azureml.pipeline.steps import PythonScriptStep,EstimatorStep
from azureml.pipeline.core import Pipeline
from azureml.data import OutputFileDatasetConfig
import pandas as pd
from azureml.data.datapath import DataPath
import azureml.mlflow
import mlflow
# Report which azureml-core version this kernel has loaded
sdk_version = azureml.core.VERSION
print("Azure ML SDK Version: ", sdk_version)

Azure ML SDK Version:  1.20.0


In [2]:
# Connect to the workspace via Workspace.from_config()
workspace = Workspace.from_config()

# Pick the compute target named 'gandalf' out of the workspace's targets
registered_targets = workspace.compute_targets
compute_target = registered_targets['gandalf']

In [136]:
%%writefile conda_dependencies.yml

# Conda specification for the environment used by the pipeline steps.
dependencies:
- python=3.6.8
- pip:
  # Azure ML SDK packages, pinned to the 1.18.0 release line.
  # NOTE(review): the notebook kernel reports SDK 1.20.0 - confirm the older pin is intended.
  - azureml-core==1.18.0.post1
  - azureml-defaults==1.18.0
  - azureml-telemetry==1.18.0
  - azureml-train-restclients-hyperdrive==1.18.0
  - azureml-train-core==1.18.0
  # Packages below carry no version constraint.
  - cmake
  - torch
  - mkl
  - future
  - numpy
  - scikit-learn
  - pandas
  - matplotlib
  - torchtext
  - azureml-mlflow
  - mlflow
  - azureml-contrib-fairness
  - fairlearn

Overwriting conda_dependencies.yml


In [137]:
# Build the run environment from the conda_dependencies.yml specification
env = Environment.from_conda_specification(
    file_path="conda_dependencies.yml",
    name="PyTorch-NLP-GPU-V1",
)

In [138]:
# Datastore that receives every output dataset produced by the pipeline
datastore = Datastore.get(workspace=workspace, datastore_name='tacoreviews')
datastore

{
  "name": "tacoreviews",
  "container_name": "tacoreviews",
  "account_name": "haldatasets",
  "protocol": "https",
  "endpoint": "core.windows.net"
}

### Upload dataset (from a DataFrame or a local directory)

In [None]:
# Reference snippets for creating a dataset. Both depend on objects that are
# not created anywhere in this notebook, so define them before running this cell.

# Option 1: register an in-memory DataFrame (the pandas variant is shown) as a
# tabular dataset in the reviews datastore.
# NOTE(review): `pandas_df` is not defined in this notebook - supply it first.
dataset_pandas = Dataset.Tabular.register_pandas_dataframe(pandas_df, datastore, "new_ds_from_pandas", show_progress=True)

# Option 2: upload a local directory as a file dataset.
# NOTE(review): `adls_datastore` is not defined in this notebook - confirm which datastore is intended.
uploaded_directory = Dataset.File.upload_directory(src_dir='weather-data/', target=DataPath(adls_datastore, 'dv_lake_store_101/'), show_progress=True)

### Get dataset

In [6]:
# Fetch the registered raw reviews dataset by name
tacoreviews_ds = Dataset.get_by_name(workspace, name='tacoreviews')

In [7]:
# Display the dataset's source, definition and registration details
tacoreviews_ds

{
  "source": [
    "('tacoreviews', 'source/')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ],
  "registration": {
    "id": "74d497e8-b074-4870-9768-f01eca1b4f8c",
    "name": "tacoreviews",
    "version": 2,
    "description": "Sample reviews we might get for our restaurant!",
    "workspace": "Workspace.create(name='hal', subscription_id='91d27443-f037-45d9-bb0c-428256992df6', resource_group='robots')"
  }
}

## Prepare Step
1. Define OutputDataset
2. Define Script Run
3. Define Step

In [105]:
# Output of the prepare step: written under prep/{run-id} in the datastore
# and registered as 'tacoreviewsprep' when the step completes
prep_output = OutputFileDatasetConfig(destination=(datastore, 'prep/{run-id}'))
prep_ds = prep_output.register_on_complete(name='tacoreviewsprep')
prep_ds

<azureml.data.output_dataset_config.OutputFileDatasetConfig at 0x7fd18b3c5e48>

In [115]:
# Run configuration for the data preparation script
prep_src = ScriptRunConfig(
    script='prepare.py',
    source_directory='.',
    environment=env,
    compute_target=compute_target,
)

In [116]:
# Pipeline step that runs the prepare script: the raw reviews dataset is
# mounted as the source and prep_ds receives the output
raw_reviews_input = tacoreviews_ds.as_named_input('tacoreviews').as_mount()

prepStep = PythonScriptStep(
    name='prepare step',
    script_name=prep_src.script,
    runconfig=prep_src.run_config,
    arguments=[
        '--source_path', raw_reviews_input,
        '--target_path', prep_ds,
    ],
)

## Train Step

In [117]:
# Output of the train step: written under train/{run-id} in the datastore
# and registered as 'tacoreviewstrain' when the step completes
train_output = OutputFileDatasetConfig(destination=(datastore, 'train/{run-id}'))
train_ds = train_output.register_on_complete(name='tacoreviewstrain')
train_ds

<azureml.data.output_dataset_config.OutputFileDatasetConfig at 0x7fd18b40eb00>

In [118]:
# Run configuration for the training script
train_src = ScriptRunConfig(
    script='train.py',
    source_directory='.',
    environment=env,
    compute_target=compute_target,
)

In [119]:
# Pipeline step that runs the training script: the prepare step's output is
# mounted as the source and train_ds receives the output
prepared_input = prep_ds.as_input(name='tacoreviewsprep').as_mount()

trainStep = PythonScriptStep(
    name='train step',
    script_name=train_src.script,
    runconfig=train_src.run_config,
    arguments=[
        '--source_path', prepared_input,
        '--target_path', train_ds,
    ],
)

# Run Pipeline

### Prep and Train Pipeline

In [120]:
# Assemble the two-step pipeline: prepare, then train
prep_and_train_steps = [prepStep, trainStep]
pipeline = Pipeline(workspace=workspace, steps=prep_and_train_steps)

In [121]:
# Submit the prepare + train pipeline under the 'nlp-sentiment-reviews' experiment
exp = Experiment(workspace=workspace, name='nlp-sentiment-reviews')
run = exp.submit(pipeline)

Created step prepare step [ff2ae02e][ae9d6d27-59e2-47b6-8069-d80858ebb6cd], (This step will run and generate new outputs)Created step train step [66f23390][e005cc40-a546-4d1a-b2c1-23a4b35e0731], (This step will run and generate new outputs)

Submitted PipelineRun 43560a07-24ba-431d-8e8b-39c206dc5981
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/nlp-sentiment-reviews/runs/43560a07-24ba-431d-8e8b-39c206dc5981?wsid=/subscriptions/91d27443-f037-45d9-bb0c-428256992df6/resourcegroups/robots/workspaces/hal


### Train Only Pipeline

In [143]:
# Pin the train-only pipeline to one registered version of the prepared reviews.
# The version is given as an integer, the type Dataset.get_by_name documents for it.
prepared_reviews_ds = Dataset.get_by_name(workspace, name='tacoreviewsprep', version=6)

In [145]:
# Stand-alone training step: reads the already-registered prepared reviews
# instead of a prepare step's output, and passes explicit hyperparameters
train_only_arguments = [
    '--source_path', prepared_reviews_ds.as_mount(),
    '--target_path', train_ds,
    '--epochs', 5,
    '--learning_rate', 5.0,
    '--batch_size', 16,
]

trainPipelineStep = PythonScriptStep(
    name='train step',
    script_name=train_src.script,
    runconfig=train_src.run_config,
    arguments=train_only_arguments,
)

In [146]:
# Pipeline containing only the stand-alone training step
train_pipeline = Pipeline(workspace=workspace, steps=[trainPipelineStep])

In [147]:
# Submit the train-only pipeline under its own experiment
# (this rebinds `exp` and `run` from the earlier submission)
exp = Experiment(workspace=workspace, name='nlp-sentiment-reviews-train')
run = exp.submit(train_pipeline)

Created step train step [eaac105c][790d3a4b-1041-4b66-bd77-9b2beea90bb1], (This step will run and generate new outputs)
Submitted PipelineRun e6d1e426-09ea-44fa-85e4-74e9d90af3a9
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/nlp-sentiment-reviews-train/runs/e6d1e426-09ea-44fa-85e4-74e9d90af3a9?wsid=/subscriptions/91d27443-f037-45d9-bb0c-428256992df6/resourcegroups/robots/workspaces/hal


In [23]:
#run.wait_for_completion(show_output=True)