# Custom negative custom classifier.

This notebook covers how to prepare a training dataset for a negative custom classifier in Amazon Comprehend leveraging the custom keywords that were generated from our word2vec model. 

We will build a custom negative classifier based on keywords semantically similar to the word "frustrated"



In [86]:
# library imports
import re
import numpy as np
import pandas as pd
import matplotlib
import csv
import boto3
import json
import time
import os
import datetime 

from sagemaker import get_execution_role
from sagemaker.session import Session

comprehend = boto3.client('comprehend')

# Specify S3 bucket and prefix that you want to use for model data
# Feel free to specify a different bucket here if you wish.
bucket = 'data-phi'
prefix = 'comprehend-custom-entity'

role = get_execution_role()
print(role)

arn:aws:iam::202860692096:role/service-role/AmazonSageMaker-ExecutionRole-20180529T141286


In this example we will re-use the dataset that we wrangled and filtered for the telco domain. 

In [87]:
colnames=['text'] 
tweets = pd.read_csv('./data/tweet_telco.csv',encoding='utf-8',names=colnames, header=None)
print(tweets.shape)
tweets.head()

(32716, 1)


Unnamed: 0,text
0,@sprintcare is the worst customer service | @1...
1,@sprintcare is the worst customer service | @1...
2,@sprintcare is the worst customer service | @1...
3,@115714 y’all lie about your “great” connectio...
4,"@115714 whenever I contact customer support, t..."


<a id='data-wrangling'></a>

In order to create our dataset we need to label the dataset.

In order to find relevant records, we will be using our custom word2vec model to find semantically similar words to "frustrated". See the blazingtext_word2vec_telco_tweets.ipynb notebook for generating keywords.

In [88]:
tweets['match_negative']=tweets['text'].str.contains(r'(Really|cheated|annoyed|unhelpful|frustrated|upset|unhappy|angry|badly|bad|dissatisfied|disappointed|disgusted)', regex=True)


  if __name__ == '__main__':


In [90]:
tweets['match_positive']=tweets['text'].str.contains(r'(Awesome|AWESOME|Awesome!|Yay!|Hero|Whoop|#YouRock!|Super|Awww!)', regex=True)

  if __name__ == '__main__':



Let's add another column with our class label. This is required part of the Amazon Comprehend training dataset.

More information can be found here.

https://docs.aws.amazon.com/comprehend/latest/dg/cer-entity-list.html


In [91]:
tweets.loc[tweets['match_negative'] == True, 'label'] = 'NEGATIVE'
tweets.loc[tweets['match_positive'] == True, 'label'] = 'POSITIVE'

In [94]:
tweets.groupby('label').count()

Unnamed: 0_level_0,text,match_negative,match_positive
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NEGATIVE,1445,1445,1445
POSITIVE,254,254,254


Let's create our training and test file.

In [98]:
training_file = 'negative_classifier_train.csv'
tweets.loc[tweets['label'].notnull(), ['label', 'text']].to_csv(training_file, encoding='utf-8', index=False)

#test_file = 'telco_negative_test.csv'
#tweets['text'].tail(10000).to_csv(test_file, encoding='utf-8', index=False)

In [99]:
def upload_to_s3(channel, file):
    s3 = boto3.resource('s3')
    data = open(file, "rb")
    key = channel + '/' + file
    s3.Bucket(bucket).put_object(Key=key, Body=data)

s3_train_key = prefix + "/train"
s3_test_key = prefix + "/test"

upload_to_s3(s3_train_key, training_file)
upload_to_s3(s3_test_key, test_file)

In [100]:
s3_train_data = 's3://{}/{}/{}'.format(bucket, s3_train_key, training_file)
s3_test_job = 's3://{}/{}/{}'.format(bucket, s3_test_key, test_file)
s3_output_job = 's3://{}/{}/{}'.format(bucket, prefix, 'output/train_job')
print('uploaded training data location: {}'.format(s3_train_data))

uploaded training data location: s3://data-phi/comprehend-custom-entity/train/negative_classifier_train.csv


In [101]:
print(str(datetime.datetime.now().strftime("%s")))
#dt.strftime("%s")


1574303977


## Training our model

In [102]:
utc = str(datetime.datetime.now().strftime("%s"))
training_job = comprehend.create_document_classifier(
    DocumentClassifierName='Custom-Negative-Classifier-'+ utc,
    DataAccessRoleArn=role,
    InputDataConfig={
        'S3Uri': s3_train_data
    },
    OutputDataConfig={
        'S3Uri': s3_output_job
    },
    LanguageCode='en'
)

In [103]:
print(json.dumps(training_job, indent=2, default=str))


{
  "DocumentClassifierArn": "arn:aws:comprehend:us-east-1:202860692096:document-classifier/Custom-Negative-Classifier-1574303978",
  "ResponseMetadata": {
    "RequestId": "c2b614f1-f423-47ec-8bed-317cfd6f4309",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "x-amzn-requestid": "c2b614f1-f423-47ec-8bed-317cfd6f4309",
      "content-type": "application/x-amz-json-1.1",
      "content-length": "127",
      "date": "Thu, 21 Nov 2019 02:39:37 GMT"
    },
    "RetryAttempts": 0
  }
}


In [104]:
jobArn = training_job['DocumentClassifierArn']

In [105]:
print(json.dumps(comprehend.describe_document_classifier(
        DocumentClassifierArn = jobArn
    ), indent=2, default=str))

{
  "DocumentClassifierProperties": {
    "DocumentClassifierArn": "arn:aws:comprehend:us-east-1:202860692096:document-classifier/Custom-Negative-Classifier-1574303978",
    "LanguageCode": "en",
    "Status": "SUBMITTED",
    "SubmitTime": "2019-11-21 02:39:38.615000+00:00",
    "InputDataConfig": {
      "S3Uri": "s3://data-phi/comprehend-custom-entity/train/negative_classifier_train.csv"
    },
    "OutputDataConfig": {
      "S3Uri": "s3://data-phi/comprehend-custom-entity/output/train_job/202860692096-CLR-dca23527a44c92eafbacbd416e3ca928/output/output.tar.gz"
    },
    "DataAccessRoleArn": "arn:aws:iam::202860692096:role/service-role/AmazonSageMaker-ExecutionRole-20180529T141286"
  },
  "ResponseMetadata": {
    "RequestId": "a93f351a-862c-4368-8bca-e0ecc12fa070",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "x-amzn-requestid": "a93f351a-862c-4368-8bca-e0ecc12fa070",
      "content-type": "application/x-amz-json-1.1",
      "content-length": "627",
      "date": "Thu, 21

In [106]:
jobArn = training_job['DocumentClassifierArn']

max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_custom_classifier = comprehend.describe_document_classifier(
        DocumentClassifierArn = jobArn
    )
    status = describe_custom_classifier["DocumentClassifierProperties"]["Status"]
    print("Custom classifier: {}".format(status))
    
    if status == "TRAINED" or status == "IN_ERROR":
        break
        
    time.sleep(60)

Custom classifier: SUBMITTED
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINED


## Confusion matrix

In [107]:
#Retrieve the S3URI from the model output and create jobkey variable.
job_output = describe_custom_classifier["DocumentClassifierProperties"]["OutputDataConfig"]["S3Uri"]
path_prefix = 's3://{}/'.format(bucket)
job_key = os.path.relpath(job_output, path_prefix)

s3://data-phi/comprehend-custom-entity/output/train_job/202860692096-CLR-dca23527a44c92eafbacbd416e3ca928/output/output.tar.gz


In [109]:
#Download the model metrics
s3 = boto3.resource('s3')
s3.Bucket(bucket).download_file(job_key, './output.tar.gz')

In [110]:
#Unpack the gzip file
!tar xvzf ./output.tar.gz

output/
output/confusion_matrix.json


In [112]:
import json

with open('./output/confusion_matrix.json') as json_file:
    data = json.load(json_file)
print(json.dumps(data, indent=2, default=str))

{
  "confusion_matrix": [
    [
      142,
      2
    ],
    [
      7,
      18
    ]
  ],
  "labels": [
    "NEGATIVE",
    "POSITIVE"
  ],
  "type": "multi_class",
  "all_labels": [
    "NEGATIVE",
    "POSITIVE",
    "label"
  ]
}


            NEGATIVE  POSITIVE
NEGATIVE    142        2
POSITIVE    7          18