# Create and orchestrate an NLP workflow using SageMaker Pipelines

## 1. Setup

In [1]:
%%capture

!pip install --upgrade sagemaker

### Imports 

In [2]:
from sagemaker.workflow.parameters import ParameterInteger, ParameterFloat, ParameterString
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.huggingface import HuggingFace
from sagemaker.inputs import TrainingInput
from time import gmtime, strftime
import pandas as pd
import sagemaker
import logging
import boto3
import json
import os

In [3]:
# Notebook-wide logger, named after the current module via __name__
# (a quoted '__name__' would create a logger literally called "__name__").
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# Attach the stream handler only once: re-running this cell would otherwise add
# another handler each time and every message would be emitted repeatedly.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

In [4]:
# Log the version of the SageMaker SDK that is actually imported in this kernel.
logger.info(f'Using SageMaker: {sagemaker.__version__}')

Using SageMaker: 2.49.0


In [5]:
# SageMaker session plus the bucket, IAM role and region used by the rest of the notebook.
session = sagemaker.Session()
bucket = session.default_bucket()
role = sagemaker.get_execution_role()
# Region comes from the default boto3 session and is reused for the S3 client below.
region = boto3.Session().region_name
s3_client = boto3.client('s3', region_name=region)

# UTC timestamp (month-day-hour-minute) used as a suffix for the pipeline name.
current_timestamp = strftime('%m-%d-%H-%M', gmtime())
pipeline_name = f'nlp-pipeline-{current_timestamp}'

In [6]:
# Log the resolved bucket, region and role so the run configuration is visible in the output.
logger.info(f'Bucket name = {bucket}')
logger.info(f'Region = {region}')
logger.info(f'Role = {role}')

Bucket name = sagemaker-us-east-1-892313895307
Region = us-east-1
Role = arn:aws:iam::892313895307:role/service-role/AmazonSageMaker-ExecutionRole-20210714T091788


## 2. Define pipeline parameters 

In [7]:
# --- Training job sizing ---
training_instance_count = ParameterInteger(
    name='TrainingInstanceCount',
    default_value=1,
)
training_instance_type = ParameterString(
    name='TrainingInstanceType',
    default_value='ml.p3.2xlarge',
)

# --- Endpoint sizing ---
deployment_instance_count = ParameterInteger(
    name='DeploymentInstanceCount',
    default_value=2,
)
deployment_instance_type = ParameterString(
    name='DeploymentInstanceType',
    default_value='ml.m5.4xlarge',
)

# --- S3 prefix under the session's default bucket for the trained model ---
trained_model_s3_uri = ParameterString(
    name='TrainedModelS3Uri',
    default_value=f's3://{bucket}/pipeline/model',
)

## 3. Define training step

In [8]:
# Hyperparameters handed to the training entry point.
# NOTE: 'model_s3' is taken from the parameter's default value, i.e. it is fixed
# when this cell runs rather than resolved per pipeline execution.
hyperparameters = {
    'epochs': 1,
    'train_batch_size': 16,
    'model_name': 'distilbert-base-uncased',
    'model_s3': trained_model_s3_uri.default_value,
    'output_dir': '/opt/ml/checkpoints',
}

In [9]:
# Hugging Face estimator that runs ./src/train.py.
# Instance type/count and the checkpoint location are read from the pipeline
# parameters' default values (fixed at definition time).
huggingface_estimator = HuggingFace(
    # Training code
    entry_point='train.py',
    source_dir='./src',
    # Compute and permissions
    instance_type=training_instance_type.default_value,
    instance_count=training_instance_count.default_value,
    role=role,
    # Framework versions
    transformers_version='4.6',
    tensorflow_version='2.4',
    py_version='py37',
    # Profiler and debugger hooks are switched off
    disable_profiler=True,
    debugger_hook_config=False,
    # Disabled alternatives, each previously set to trained_model_s3_uri.default_value:
    #   model_dir=..., output_dir=..., output_path=...
    checkpoint_s3_uri=trained_model_s3_uri.default_value,
    hyperparameters=hyperparameters,
)

In [10]:
# Pipeline step named 'train' that wraps the Hugging Face estimator.
train_step = TrainingStep(
    name='train',
    estimator=huggingface_estimator,
)

TrainingStep(name='train', step_type=<StepTypeEnum.TRAINING: 'Training'>, depends_on=None)

## 4. Define processing step for model deployment

In [11]:
# Endpoint name shares the timestamp suffix used for the pipeline name.
endpoint_name = f'hf-clf-{current_timestamp}'
# S3 URI for the deployment script under the session's default bucket.
deploy_model_script_uri = f's3://{bucket}/pipeline/code/deploy.py'

In [12]:
# Upload the local deployment script to the bucket; the key matches the path
# portion of deploy_model_script_uri.
s3_client.upload_file(
    Filename='./src/deploy.py',
    Bucket=bucket,
    Key='pipeline/code/deploy.py',
)

In [13]:
# Single-instance scikit-learn processor used to run the deployment script.
deploy_model_processor = SKLearnProcessor(
    framework_version='0.23-1',
    role=role,
    instance_type='ml.t3.medium',
    instance_count=1,
    base_job_name='deploy-processing-job',
    sagemaker_session=session,
)

In [14]:
# Processing step that runs the uploaded deploy.py script once training has finished.
deploy_step = ProcessingStep(
    name='deploy',
    depends_on=[train_step.name],  # run only after the 'train' step
    processor=deploy_model_processor,
    job_arguments=[
        '--model_name', endpoint_name,  # reuse endpoint name
        '--region', region,
        # Passed as the pipeline parameter object itself.
        '--deployment_instance_type', deployment_instance_type,
        # Stringify the parameter's default value, not the parameter object:
        # str() on the object is evaluated at definition time and, depending on
        # the SDK version, yields the object's repr or raises TypeError.
        # NOTE: this pins the count to the default; a runtime override of
        # DeploymentInstanceCount is not forwarded to the script.
        '--deployment_instance_count', str(deployment_instance_count.default_value),
        '--model_s3_path', trained_model_s3_uri.default_value,
        '--endpoint_name', endpoint_name,
    ],
    code=deploy_model_script_uri,
)

In [15]:
# Show the step names the deploy step has been told to wait for.
custom_dependencies = deploy_step.depends_on
custom_dependencies

['train']

## 5. Create Pipeline

In [16]:
# Assemble the pipeline: declared parameters plus the train and deploy steps.
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        training_instance_type,
        training_instance_count,
        deployment_instance_type,
        deployment_instance_count,
        trained_model_s3_uri,
    ],
    steps=[train_step, deploy_step],
    sagemaker_session=session,
)

In [18]:
# pipeline.definition() is parsed from JSON into a dict so it can be inspected below.
definition = json.loads(pipeline.definition())
definition

{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'TrainingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.p3.2xlarge'},
  {'Name': 'TrainingInstanceCount', 'Type': 'Integer', 'DefaultValue': 1},
  {'Name': 'DeploymentInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.4xlarge'},
  {'Name': 'DeploymentInstanceCount', 'Type': 'Integer', 'DefaultValue': 2},
  {'Name': 'TrainedModelS3Uri',
   'Type': 'String',
   'DefaultValue': 's3://sagemaker-us-east-1-892313895307/pipeline/model'}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'train',
   'Type': 'Training',
   'Arguments': {'AlgorithmSpecification': {'TrainingInputMode': 'File',
     'TrainingImage': '763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-tensorflow-training:2.4-transformers4.6-gpu-py37-cu110-ubuntu18.04',
     'EnableSageMakerMetricsTimeSeries': True},
    'Outpu

In [19]:
# Register the pipeline with SageMaker. upsert() creates the pipeline, or updates
# it if one with this name already exists, so re-running this cell does not fail
# with a "pipeline names must be unique" error the way a plain create() would.
response = pipeline.upsert(role_arn=role)
response

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:892313895307:pipeline/nlp-pipeline-07-19-01-37',
 'ResponseMetadata': {'RequestId': '80f51c83-b2a3-4aad-af34-deba663607de',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '80f51c83-b2a3-4aad-af34-deba663607de',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '92',
   'date': 'Mon, 19 Jul 2021 01:37:07 GMT'},
  'RetryAttempts': 0}}

## 6. Execute Pipeline

In [20]:
# Start a pipeline execution (no parameter overrides are passed) and show its ARN.
execution = pipeline.start()
execution.arn

'arn:aws:sagemaker:us-east-1:892313895307:pipeline/nlp-pipeline-07-19-01-37/execution/dsaajdp2h647'

In [21]:
# Fetch the execution's describe() response; `status` holds the full response dict,
# of which 'PipelineExecutionStatus' is one field.
status = execution.describe()
status

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:892313895307:pipeline/nlp-pipeline-07-19-01-37',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-1:892313895307:pipeline/nlp-pipeline-07-19-01-37/execution/dsaajdp2h647',
 'PipelineExecutionDisplayName': 'execution-1626658628689',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2021, 7, 19, 1, 37, 8, 625000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2021, 7, 19, 1, 37, 8, 625000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:892313895307:user-profile/d-dowart1jabkf/ts-zd-e2e',
  'UserProfileName': 'ts-zd-e2e',
  'DomainId': 'd-dowart1jabkf'},
 'LastModifiedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:892313895307:user-profile/d-dowart1jabkf/ts-zd-e2e',
  'UserProfileName': 'ts-zd-e2e',
  'DomainId': 'd-dowart1jabkf'},
 'ResponseMetadata': {'RequestId': '40496380-c3ec-41e4-92cb-370ca8a8d652',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid'