
## How to prepare a dataset and submit a custom entity in Amazon Comprehend

This notebook covers how to prepare a training dataset for custom entities in Amazon Comprehend, train a custom entity recognizer with it, and test the resulting model.

More information on how to create a custom entity recognizer model can be found here.

https://docs.aws.amazon.com/comprehend/latest/dg/training-recognizers.html


In [None]:
# library imports
import csv
import datetime
import json
import os
import re
import time

import boto3
import matplotlib
import numpy as np
import pandas as pd

In this example we will be using the Customer Support on Twitter dataset (https://www.kaggle.com/thoughtvector/customer-support-on-twitter). First, let's get our data and process it to fit our needs.

In [None]:
# Prefix used for the recognizer name and for the test-output location in S3.
CUSTOM_NER_PREFIX = 'TweetTelco'
# S3 bucket the dataset is downloaded from and the generated files are uploaded to.
BUCKET = 'comprehend-ner-20200602'
# IAM role ARN handed to Amazon Comprehend as DataAccessRoleArn.
# NOTE(review): account-specific value — replace with your own role before running.
role = 'arn:aws:iam::951145066533:role/service-role/AmazonComprehendServiceRole-cmpdner'

In [None]:

# The object key doubles as the local destination path.
s3_file = 'data/tweet_telco.csv'
s3 = boto3.client('s3')

# download_file does not create missing folders, so make sure the local
# 'data/' directory exists before fetching the object.
os.makedirs(os.path.dirname(s3_file), exist_ok=True)
s3.download_file(BUCKET, s3_file, s3_file)


In [None]:
# The CSV has no header row; every line is a single tweet stored in a 'text' column.
colnames = ['text']
tweets = pd.read_csv(
    'data/tweet_telco.csv',
    header=None,
    names=colnames,
    encoding='utf-8',
)
print(tweets.shape)
tweets.head()

In order to create our dataset we need to provide an entity list for our new class named NEGATIVE.

In order to find relevant entities, we used BlazingText in a separate notebook to find similar words using word2vec. See ./blazingtext_word2vec/blazingtext_word2vec_telco_tweets.ipynb for examples of retrieving such keywords.

In [None]:
# Seed vocabulary for the custom entity type, one entry per entity-list row.
negative_words = [
    'Really', 'cheated', 'annoyed', 'unhelpful', 'frustrated',
    'upset', 'unhappy', 'angry', 'badly', 'bad',
    'surprised', 'sadly', 'dissatisfied', 'disappointed', 'disgusted',
]

# Single-column frame; the column is named 'Text'.
df_entity_list = pd.DataFrame({'Text': negative_words})

Let's add another column with our class label. This is a required part of the Amazon Comprehend training dataset.

More information can be found here.

https://docs.aws.amazon.com/comprehend/latest/dg/cer-entity-list.html

In [None]:
# Label every entry in the entity list with the custom entity type.
df_entity_list = df_entity_list.assign(Type='NEGATIVE')

Let's create a training file

In [None]:
# Write the tweet text only: one document per line, no index column and no header row.
tweets['text'].to_csv(
    'data/raw_negative.csv',
    header=False,
    index=False,
    encoding='utf-8',
)

In [None]:
# Preview the first lines of the file written in the previous cell.
!head data/raw_negative.csv

Let's create the entity list file

In [None]:
# Persist the entity list; the column names are kept as the header row, the index is dropped.
df_entity_list.to_csv(
    'data/entity_negative_list.csv',
    index=False,
    encoding='utf-8',
)

In [None]:
# Preview the first lines of the entity list file written in the previous cell.
!head data/entity_negative_list.csv

In [None]:
# Upload the entity list to the bucket; the local relative path is reused as the object key.
entity_file = 'data/entity_negative_list.csv'
s3.upload_file(entity_file, BUCKET, entity_file)

Let's create the training and test files from our original telco tweet dataset.

In [None]:
# Training documents: the last 10,000 tweets, one per line without header or index.
# The file needs its own name: the test cell writes 'data/telco_negative_test.csv',
# so sharing that path would overwrite the training data both locally and in S3.
train_file = 'data/telco_negative_train.csv'
tweets['text'].tail(10000).to_csv(train_file, encoding='utf-8', index=False, header=False)
# The local relative path is reused as the object key in the bucket.
s3.upload_file(train_file, BUCKET, train_file)

In [None]:
# Test documents: the first 10,000 tweets, one per line without header or index,
# uploaded to the bucket under the same relative path.
test_file = 'data/telco_negative_test.csv'
tweets['text'].head(10000).to_csv(test_file, encoding='utf-8', index=False,header=False)
s3.upload_file(test_file, BUCKET, test_file)

### Training the custom NER Model

In [None]:
# The object keys are the same relative paths that were used for the uploads.
s3_entity_key, s3_train_key, s3_test_key = entity_file, train_file, test_file

prefix = CUSTOM_NER_PREFIX

# Build the fully qualified S3 URIs used by the training and test requests.
s3_root = 's3://'
s3_train_data = s3_root + '/'.join([BUCKET, s3_train_key])
s3_train_entity = s3_root + '/'.join([BUCKET, s3_entity_key])
s3_test_data = s3_root + '/'.join([BUCKET, s3_test_key])
s3_output_test_data = s3_root + '/'.join([BUCKET, prefix, 'test', "telco_test_output.json"])
print('uploaded training data location: {}'.format(s3_train_data))

In [None]:
# Instantiate the Amazon Comprehend client.
# NOTE(review): the region is fixed to us-east-1 — confirm it matches the bucket and role in use.
comprehend = boto3.client('comprehend', region_name='us-east-1')

# Training input: the raw documents, the entity list, and the entity type to learn.
custom_entity_request = {
    "Documents": {
        "S3Uri": s3_train_data
    },
    "EntityList": {
        "S3Uri": s3_train_entity
    },
    "EntityTypes": [
        {
            "Type": "NEGATIVE"
        }
    ]
}

# Create the custom entity recognizer.
# The current epoch time in seconds is appended to the prefix so that each run
# gets a distinct recognizer name. int(timestamp()) is used instead of
# strftime("%s"), which is a platform-specific extension and not portable.
# NOTE(review): `id` shadows the builtin; the name is kept because cells outside
# this section may reference it.
id = str(int(datetime.datetime.now().timestamp()))
create_custom_entity_response = comprehend.create_entity_recognizer(
    RecognizerName=CUSTOM_NER_PREFIX + id,
    DataAccessRoleArn=role,
    InputDataConfig=custom_entity_request,
    LanguageCode="en"
)
# print() does not apply %-style placeholders, so format the message explicitly.
print("Create response: {}\n".format(create_custom_entity_response))

In [None]:
# Display the ARN of the recognizer returned by the create call.
create_custom_entity_response['EntityRecognizerArn']

In [None]:
# Check the status of the entity recognizer
describe_response = comprehend.describe_entity_recognizer(
    EntityRecognizerArn=create_custom_entity_response['EntityRecognizerArn'])
# print() does not apply %-style placeholders, so format the message explicitly.
print("Describe response: {}".format(describe_response['EntityRecognizerProperties']['Status']))

jobArn = create_custom_entity_response['EntityRecognizerArn']

# Statuses after which training will make no further progress; polling stops
# on any of them instead of waiting out the full time budget.
terminal_statuses = ("TRAINED", "TRAINED_WITH_WARNING", "IN_ERROR", "STOPPED")

# Poll once a minute until a terminal status is reached or the time budget is spent.
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_custom_recognizer = comprehend.describe_entity_recognizer(
        EntityRecognizerArn = jobArn
    )
    status = describe_custom_recognizer["EntityRecognizerProperties"]["Status"]
    print("Custom entity recognizer: {}".format(status))

    if status in terminal_statuses:
        break

    time.sleep(60)
else:
    # Only reached when the loop ends without hitting `break`.
    print("Timed out waiting for the custom entity recognizer to finish training")

### Testing our Model

In [None]:
# Pretty-print the "EntityTypes" entries of the recognizer metadata from the last
# describe call; default=str stringifies values json cannot serialize natively.
print(json.dumps(describe_custom_recognizer["EntityRecognizerProperties"]["RecognizerMetadata"]["EntityTypes"], indent=2, default=str))

In [None]:
# Input: the uploaded test file, where each line is treated as one document.
test_input_config = {
    'S3Uri': s3_test_data,
    'InputFormat': 'ONE_DOC_PER_LINE',
}
# Output: S3 location the detection results are written to.
test_output_config = {'S3Uri': s3_output_test_data}

# Start an asynchronous entity detection job that uses the custom recognizer.
test_response = comprehend.start_entities_detection_job(
    JobName='Custom_Negative_Test',
    LanguageCode='en',
    EntityRecognizerArn=jobArn,
    DataAccessRoleArn=role,
    InputDataConfig=test_input_config,
    OutputDataConfig=test_output_config,
)


Let's monitor the job.

In [None]:
jobId = test_response['JobId']

# Statuses after which the job will make no further progress; polling stops
# on any of them instead of waiting out the full time budget.
terminal_job_statuses = ("COMPLETED", "FAILED", "STOPPED")

# Poll once a minute until a terminal status is reached or the time budget is spent.
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_job = comprehend.describe_entities_detection_job(
        JobId = jobId
    )
    status = describe_job["EntitiesDetectionJobProperties"]["JobStatus"]
    print("Job Status: {}".format(status))

    if status in terminal_job_statuses:
        break

    time.sleep(60)
else:
    # Only reached when the loop ends without hitting `break`.
    print("Timed out waiting for the entities detection job to finish")

In [None]:
# Show where the detection job wrote its results. `job_key` is only derived in
# the next cell, so referencing it here fails on a fresh kernel.
describe_job["EntitiesDetectionJobProperties"]["OutputDataConfig"]["S3Uri"]

In [None]:
# Download the test output to the local machine
job_output = describe_job["EntitiesDetectionJobProperties"]["OutputDataConfig"]["S3Uri"]
path_prefix = 's3://{}/'.format(BUCKET)

# Strip the 's3://<bucket>/' prefix to obtain the object key. Plain string
# slicing is used because an S3 URI is not a filesystem path.
if not job_output.startswith(path_prefix):
    raise ValueError("Unexpected job output location: {}".format(job_output))
job_key = job_output[len(path_prefix):]

# Reuse the existing S3 client rather than rebinding `s3` to a resource object,
# so the earlier `s3.upload_file(...)` cells still work when re-run.
s3.download_file(BUCKET, job_key, 'output.tar.gz')

In [None]:
# Unpack the downloaded archive into the working directory.
!tar xvzf output.tar.gz

In [None]:
# Load all the entity values into a list.
# Each non-empty line of the extracted 'output' file is parsed as one JSON
# document; the 'Text' of every entity it lists is collected.
data = []
with open('output', 'r', encoding='utf-8') as output_file:
    for line in output_file:
        if not line.strip():
            # Skip blank lines, which are not valid JSON.
            continue
        # Treat a missing or null 'Entities' field as "no entities".
        entities = json.loads(line).get('Entities') or []
        data.extend(entity['Text'] for entity in entities)
    

# function to print the unique values of a list
def unique(list1):
    """Print each distinct value of ``list1`` once, one value per line.

    Values are printed in order of first appearance, so the output is the
    same on every run. The items must be hashable. Returns None.
    """
    # dict.fromkeys removes duplicates while keeping insertion order.
    for x in dict.fromkeys(list1):
        print(x)
        
# Print each distinct entity text collected in `data`.
unique(data)

### Create and use an endpoint