In [1]:
# First, let's get the latest installations of our dependencies
!pip install --upgrade pip
!pip install boto3 --upgrade
!pip install -U botocore

Collecting pip
[?25l  Downloading https://files.pythonhosted.org/packages/43/84/23ed6a1796480a6f1a2d38f2802901d078266bda38388954d01d3f2e821d/pip-20.1.1-py2.py3-none-any.whl (1.5MB)
[K    100% |████████████████████████████████| 1.5MB 22.6MB/s ta 0:00:01
[?25hInstalling collected packages: pip
  Found existing installation: pip 10.0.1
    Uninstalling pip-10.0.1:
      Successfully uninstalled pip-10.0.1
Successfully installed pip-20.1.1
[33mYou are using pip version 20.1.1, however version 20.2b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting boto3
  Downloading boto3-1.13.19-py2.py3-none-any.whl (128 kB)
[K     |████████████████████████████████| 128 kB 11.6 MB/s eta 0:00:01
[?25hCollecting s3transfer<0.4.0,>=0.3.0
  Downloading s3transfer-0.3.3-py2.py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 10.0 MB/s eta 0:00:01
[?25hCollecting botocore<1.17.0,>=1.16.19
  Downloading botocore-1.16.19-py2.py3-

# Environment Setup

We need to set up the following data:

    REGION - Region to call A2I.
    BUCKET_NAME - A S3 bucket accessible by the given role
        Used to store the input files and output results
        Must be within the same region A2I is called from
    WORKTEAM_ARN - To create your Private Workteam, visit the instructions here: https://docs.aws.amazon.com/sagemaker/latest/dg/sms-workforce-private.html After you have created your workteam, replace \<YOUR-WORKTEAM-ARN> below
    ROLE - The IAM role used as part of StartHumanLoop. By default, this notebook will use the execution role. You can learn more about IAM Policies here https://docs.aws.amazon.com/IAM/latest/UserGuide/access_policies.html



In [94]:


REGION = 'us-east-1'
BUCKET_NAME = 'comprehend-data-label'
WORKTEAM_ARN= "arn:aws:sagemaker:us-east-1:820570838999:workteam/private-crowd/Comprehend"



In [95]:
from sagemaker import get_execution_role
import sagemaker

# Setting Role to the default SageMaker Execution Role
ROLE = get_execution_role()
display(ROLE)

'arn:aws:iam::820570838999:role/service-role/sgdemo-AmazonSageMaker-ExecutionRole'

In [96]:
#Setup Bucket and Paths
import os
import boto3
import botocore

sess = sagemaker.Session()

# Client Setup

Let's setup the clients for Amazon S3, Amazon SageMaker A2I Runtime and Amazon Comprehend.


In [97]:
import boto3
import io
import json
import uuid
import botocore
import time
import botocore

# Amazon SageMaker client
sagemaker = boto3.client('sagemaker', REGION)

# Amazon Comprehend client
comprehend = boto3.client('comprehend', REGION)

# S3 client
s3 = boto3.client('s3', REGION)

# A2I Runtime client
a2i_runtime_client = boto3.client('sagemaker-a2i-runtime', REGION)

In [98]:
import pprint

# Pretty print setup
pp = pprint.PrettyPrinter(indent=2)

# Function to pretty-print AWS SDK responses
def print_response(response):
    if 'ResponseMetadata' in response:
        del response['ResponseMetadata']
    pp.pprint(response)

# Sample Data

Let's create some sample text that we would test our translation with and store it in S3.


In [99]:
translation_text = """
Just then another visitor entered the drawing room: Prince Andrew Bolkónski, the little princess’ husband. He was a very handsome young man, of medium height, with firm, clearcut features. Everything about him, from his weary, bored expression to his quiet, measured step, offered a most striking contrast to his quiet, little wife. It was evident that he not only knew everyone in the drawing room, but had found them to be so tiresome that it wearied him to look at or listen to them. And among all these faces that he found so tedious, none seemed to bore him so much as that of his pretty wife. He turned away from her with a grimace that distorted his handsome face, kissed Anna Pávlovna’s hand, and screwing up his eyes scanned the whole company.
"""

key = "input/test.txt"

s3.put_object(Bucket=BUCKET_NAME, Key=key, Body=translation_text)

{'ResponseMetadata': {'RequestId': 'C0DFA7E8C61763A9',
  'HostId': 'ziS04WXFEKDcVcMCPjoR98EeEg35Rv5eOsxzjmBDj6p27NsMMIYEbYXzxif02/mRFuW86OcfI+g=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'ziS04WXFEKDcVcMCPjoR98EeEg35Rv5eOsxzjmBDj6p27NsMMIYEbYXzxif02/mRFuW86OcfI+g=',
   'x-amz-request-id': 'C0DFA7E8C61763A9',
   'date': 'Wed, 03 Jun 2020 01:21:20 GMT',
   'x-amz-version-id': 'IPcN1E2HLxWLBkgEpAk5JVZVgBezaVH7',
   'etag': '"ba9b13b50673313a99cee6a1d8fdc1c6"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"ba9b13b50673313a99cee6a1d8fdc1c6"',
 'VersionId': 'IPcN1E2HLxWLBkgEpAk5JVZVgBezaVH7'}

# Create Control Plane Resources
Create a Worker Task Tempalte

Create a human task UI resource, giving a UI template in liquid html. This template will be rendered to the human workers whenever human loop is required.

For over 70 pre built UIs, check: https://github.com/aws-samples/amazon-a2i-sample-task-uis.

We will be taking translation review and correction UI and filling in the object categories in the labels variable in the template.


In [265]:
template = """

<script src="https://assets.crowd.aws/crowd-html-elements.js"></script>

<crowd-entity-annotation
  name="entities"
  header="Highlight parts of the text below"
  labels="{{ task.input.labels | to_json | escape }}"
  text="{{task.input.taskObject}}"

  >
  <full-instructions header="Named entity recognition instructions">
    <ol>
      <li><strong>Read</strong> the text carefully.</li>
      <li><strong>Highlight</strong> words, phrases, or sections of the text.</li>
      <li><strong>Choose</strong> the label that best matches what you have highlighted.</li>
      <li>To <strong>change</strong> a label, choose highlighted text and select a new label.</li>
      <li>To <strong>remove</strong> a label from highlighted text, choose the X next to the abbreviated label name on the highlighted text.</li>
      <li>You can select all of a previously highlighted text, but not a portion of it.</li>
    </ol>
  </full-instructions>

  <short-instructions>
    Apply labels to words or phrases.
  </short-instructions>

   

"""

# Create a Worker Task Template Creator Function

This function would be a higher level abstration, on the SageMaker package's method to create the Worker Task Template which we will use in the next step to create a human review workflow.


In [266]:
def create_task_ui(task_ui_name, template):
    '''
    Creates a Human Task UI resource.

    Returns:
    struct: HumanTaskUiArn
    '''
    response = sagemaker.create_human_task_ui(
        HumanTaskUiName=task_ui_name,
        UiTemplate={'Content': template})
    return response

In [268]:


# Task UI name - this value is unique per account and region. You can also provide your own value here.
taskUIName = 'a2i-comprehend-test-12-ue-1'

# Create task UI
humanTaskUiResponse = create_task_ui(taskUIName, template)
humanTaskUiArn = humanTaskUiResponse['HumanTaskUiArn']
print(humanTaskUiArn)



arn:aws:sagemaker:us-east-1:820570838999:human-task-ui/a2i-comprehend-test-12-ue-1


# Creating the Flow Definition

In this section, we're going to create a flow definition definition. Flow Definitions allow us to specify:

    The workforce that your tasks will be sent to.
    The instructions that your workforce will receive. This is called a worker task template.
    Where your output data will be stored.

This demo is going to use the API, but you can optionally create this workflow definition in the console as well.

For more details and instructions, see: https://docs.aws.amazon.com/sagemaker/latest/dg/a2i-create-flow-definition.html.


In [269]:
def create_flow_definition(flow_definition_name):
    '''
    Creates a Flow Definition resource

    Returns:
    struct: FlowDefinitionArn
    '''
    response = sagemaker.create_flow_definition(
            FlowDefinitionName= flow_definition_name,
            RoleArn= ROLE,
            HumanLoopConfig= {
                "WorkteamArn": WORKTEAM_ARN,
                "HumanTaskUiArn": humanTaskUiArn,
                "TaskCount": 1,
                "TaskDescription": "Please review the entities and labels done using Amazon Comprehend and make corrections and improvements.",
                "TaskTitle": "Review and Improve entity."
            },
            OutputConfig={
                "S3OutputPath" : "s3://"+BUCKET_NAME+"/"
            }
        )
    
    return response['FlowDefinitionArn']

# Now we are ready to create our flow definition

In [270]:


# Flow definition name - this value is unique per account and region. You can also provide your own value here.
uniqueId = str(uuid.uuid4())
flowDefinitionName = f'comprehend-a2i-{uniqueId}' 

flowDefinitionArn = create_flow_definition(flowDefinitionName)
print(flowDefinitionArn)



arn:aws:sagemaker:us-east-1:820570838999:flow-definition/comprehend-a2i-6943a9bd-c543-4b52-bd10-2fd9434e9d2f


# Data Load

In [271]:
# Get file from S3 and load it into a variable
file_contents = s3.get_object(Bucket=BUCKET_NAME, Key=key)['Body'].read().decode("utf-8", 'ignore')

# Get just the filename without prefix or suffix
fileName = key[key.rindex('/')+1:key.rindex('.')]
print(fileName)

test


# Comprehend Documents

Now that we have the Human Review Workflow set up, we can comprehend our documents and pass them over to a Human Loop for review.


In [274]:
# Create the human loop input JSON object
humanLoopInput = {
    'SourceLanguage' : 'English',
    'sourceLanguageCode':'en',
    'rowCount': 0,
    'labels' : [],
    'taskObject':[],
    'bucketName': BUCKET_NAME,
    'keyName': key
}

translatedText = ''
rowCount = 0

print('Splitting file and performing translation')    
textvalues=[]
# split the body by period to get individual sentences
for sentence in file_contents.split('.'):
    if len(sentence.lstrip()) > 0:
        # call translation
        comprehend_response = comprehend.detect_entities(
                                Text=sentence + '.',
                                LanguageCode='en')

        entities = comprehend_response['Entities']
      
        textvalues=[]
        for s in entities:
                 textvalues.append(s.get("Type"))
        set(textvalues)
        str1 = ';'.join(textvalues)
        originalText = sentence + ' '
        labels={ 'label':str1}
        taskObject = {
                        'originalText': sentence + '.'
                            }
        
        #humanLoopInput['taskObject'].append(taskObject)
        
        rowCount+=1
        humanLoopInput['taskObject'].append(taskObject)       
        humanLoopInput['labels'].append(labels)       
print(humanLoopInput)
        

humanLoopInput['rowCount'] = rowCount



Splitting file and performing translation
{'SourceLanguage': 'English', 'sourceLanguageCode': 'en', 'rowCount': 0, 'labels': [{'label': 'PERSON'}, {'label': ''}, {'label': ''}, {'label': ''}, {'label': ''}, {'label': 'PERSON'}], 'taskObject': [{'originalText': '\nJust then another visitor entered the drawing room: Prince Andrew Bolkónski, the little princess’ husband.'}, {'originalText': ' He was a very handsome young man, of medium height, with firm, clearcut features.'}, {'originalText': ' Everything about him, from his weary, bored expression to his quiet, measured step, offered a most striking contrast to his quiet, little wife.'}, {'originalText': ' It was evident that he not only knew everyone in the drawing room, but had found them to be so tiresome that it wearied him to look at or listen to them.'}, {'originalText': ' And among all these faces that he found so tedious, none seemed to bore him so much as that of his pretty wife.'}, {'originalText': ' He turned away from her wit

In [275]:
humanLoopName = 'Comprehend-A2I-Text' + str(int(round(time.time() * 1000)))
print('Starting human loop - ' + humanLoopName)
response = a2i_runtime_client.start_human_loop(
                            HumanLoopName=humanLoopName,
                            FlowDefinitionArn= flowDefinitionArn,
                            HumanLoopInput={
                                'InputContent': json.dumps(humanLoopInput)
                                }
                            )

# write the machine translated file to S3 bucket.
targetKey = ('machine_output/MO-{0}.txt').format(fileName)
print ('Writing translated text to '+ BUCKET_NAME + '/' + targetKey)
s3.put_object(Bucket=BUCKET_NAME, Key=targetKey, Body=translatedText.encode('utf-8'))

Starting human loop - Comprehend-A2I-Text1591733581953
Writing translated text to comprehend-data-label/machine_output/MO-test.txt


{'ResponseMetadata': {'RequestId': 'A6A1D06641439F92',
  'HostId': 'AHCt+LItmiSaeRoJOx8u52MzlvceXEh8xCv8AmAxYC6ehzLZvpp+XQMZMBDf8THa/YLBDj0bW0s=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'AHCt+LItmiSaeRoJOx8u52MzlvceXEh8xCv8AmAxYC6ehzLZvpp+XQMZMBDf8THa/YLBDj0bW0s=',
   'x-amz-request-id': 'A6A1D06641439F92',
   'date': 'Tue, 09 Jun 2020 20:13:03 GMT',
   'x-amz-version-id': 'Nk50gmevnCzTROad34n3TBvEmGKhYL8d',
   'etag': '"d41d8cd98f00b204e9800998ecf8427e"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"d41d8cd98f00b204e9800998ecf8427e"',
 'VersionId': 'Nk50gmevnCzTROad34n3TBvEmGKhYL8d'}

# Check Status of Human Loop

Let's define a function that allows us to check the status of Human Loop progress.



In [276]:

resp = a2i_runtime_client.describe_human_loop(HumanLoopName=humanLoopName)
print(f'HumanLoop Name: {humanLoopName}')
print(f'HumanLoop Status: {resp["HumanLoopStatus"]}')
print(f'HumanLoop Output Destination: {resp["HumanLoopOutput"]}')
print('\n')

humanLoopStatus = resp["HumanLoopStatus"]
outputFilePath = resp["HumanLoopOutput"]

HumanLoop Name: Comprehend-A2I-Text1591733581953
HumanLoop Status: InProgress
HumanLoop Output Destination: {'OutputS3Uri': 's3://comprehend-data-label/comprehend-a2i-6943a9bd-c543-4b52-bd10-2fd9434e9d2f/2020/06/09/20/13/02/Comprehend-A2I-Text1591733581953/output.json'}




# Wait For Work Team to Complete Task

In [277]:
workteamName = WORKTEAM_ARN[WORKTEAM_ARN.rfind('/') + 1:]
print("Navigate to the private worker portal and do the tasks. Make sure you've invited yourself to your workteam!")
print('https://' + sagemaker.describe_workteam(WorkteamName=workteamName)['Workteam']['SubDomain'])

Navigate to the private worker portal and do the tasks. Make sure you've invited yourself to your workteam!
https://1ajv1yl4hz.labeling.us-east-1.sagemaker.aws


# Check Status of Human Loop Again and process Task Results

Once the Human Loop Status has changed to completed, you can post process the results to build the final file, with Human Reviewed corrections, for future use.


In [278]:
resp = a2i_runtime_client.describe_human_loop(HumanLoopName=humanLoopName)
humanLoopStatus = resp["HumanLoopStatus"]
outputFilePath = resp["HumanLoopOutput"]['OutputS3Uri']

if humanLoopStatus == "Completed":
    # Remove s3:// from S3 File Path
    outputFilePath = outputFilePath.replace("s3://", "")

    # recreate the output text document, including post edits.
    tmsFile = s3.get_object(Bucket=outputFilePath.split('/')[0],
                                Key="/".join(outputFilePath.split('/')[1:]))['Body'].read()

    tmsFile = json.loads(tmsFile.decode('utf-8'))
    inputContent = tmsFile['inputContent']
    rowcount = inputContent['rowCount']
    answerContent = tmsFile['humanAnswers'][0]['answerContent']
    editedContent = ''

    # extract the file name
    targetKeyName = inputContent['keyName']
    targetKeyName = targetKeyName[targetKeyName.index('/') + 1: len(targetKeyName)]

    # save the file.
    s3.put_object(Bucket=BUCKET_NAME,
                      Key='post_edits/PO-{0}'.format(targetKeyName),
                    Body=editedContent.encode('utf-8'))

    print("Output File successfully stored in s3://{0}/post_edits/PO-{1}".format(BUCKET_NAME,targetKeyName))
elif humanLoopStatus == "InProgress":
    print("Navigate to the private worker portal and do the tasks. Make sure you've invited yourself to your workteam!")
    print('https://' + sagemaker.describe_workteam(WorkteamName=workteamName)['Workteam']['SubDomain'])

Output File successfully stored in s3://comprehend-data-label/post_edits/PO-test.txt
