# Entrenamiento de un modelo clasificador custom de Amazon Comprehend

## Hay que tener en cuenta que Amazon Comprehend sólo es compatible con un subconjunto de regiones:

* Este de EE.UU. (N. Virginia), Este de EE.UU. (Ohio), Oeste de EE.UU. (Oregón)
* Canadá (Central)
* Europa (Londres), Europa (Irlanda), Europa (Frankfurt)
* Asia Pacífico (Bombay), Asia Pacífico (Seúl), Asia Pacífico (Tokio), Asia Pacífico (Singapur), Asia Pacífico (Sídney)

Puedes consultar la disponibilidad actualizada de cada servicio por región en https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services

## Check dependencies setup

In [None]:
%store -r setup_dependencies_passed
%store -r comprehend_train_s3_uri

# Both variables are restored from the IPython store by the `%store -r` lines
# of this cell. If an earlier notebook was never run, the name does not exist
# at all, so each lookup is guarded against NameError and treated as
# "not ready" instead of crashing the cell.
try:
    dependencies_ok = bool(setup_dependencies_passed)
except NameError:
    dependencies_ok = False

try:
    train_data_ok = bool(comprehend_train_s3_uri)
except NameError:
    train_data_ok = False

if not dependencies_ok:
    print("++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOU HAVE TO RUN THE PREVIOUS NOTEBOOK ")
    print("You did not install the required libraries.   ")
    print("++++++++++++++++++++++++++++++++++++++++++++++")

if not train_data_ok:
    print("****************************************************************************************")
    print("**************** PLEASE RE-RUN THE PREVIOUS DATA PREPARATION NOTEBOOK ******************")
    print("**************** THIS NOTEBOOK WILL NOT RUN PROPERLY ***********************************")
    print("****************************************************************************************")

# Report success only when every prerequisite is satisfied; previously "[OK]"
# was printed even if the dependency check had just failed.
if dependencies_ok and train_data_ok:
    print("[OK] Everything is correctly set up")

In [None]:
# List what is stored at the training-data S3 URI restored from the IPython store.
!aws s3 ls $comprehend_train_s3_uri

## Setup notebook

In [None]:
import csv
import datetime
import json
import time

import boto3
import pandas as pd
import sagemaker
from botocore.exceptions import ClientError

# Region name of the default boto3 session.
region = boto3.Session().region_name
# SageMaker session, its default S3 bucket, and the execution role.
sm = sagemaker.Session()
bucket = sm.default_bucket()
role = sagemaker.get_execution_role()

from botocore.config import Config

# Retry AWS API calls with up to 10 attempts in botocore's "adaptive" mode.
config = Config(retries={"max_attempts": 10, "mode": "adaptive"})

# IAM client used by later cells to create the role and attach its policy.
iam = boto3.client("iam", config=config)

## Comprobamos si nuestra región está soportada

In [None]:
# Regions that this notebook treats as supporting Amazon Comprehend.
comprehend_regions = (
    "ap-south-1",
    "eu-west-2",
    "eu-west-1",
    "ap-northeast-2",
    "ap-northeast-1",
    "ca-central-1",
    "ap-southeast-1",
    "ap-southeast-2",
    "eu-central-1",
    "us-east-1",
    "us-east-2",
    "us-west-2",
)

# Handle the unsupported case first, then fall through to the success message.
if region not in comprehend_regions:
    banner = "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    print(banner)
    print(f" [ERROR] COMPREHEND IS NOT YET SUPPORTED IN {region}.")
    print(" [INFO] This is OK. Skip this notebook and continue with the next use case.")
    print(" [INFO] This notebook is not required for the rest of this workshop.")
    print(banner)
else:
    print(f" [OK] COMPREHEND IS SUPPORTED IN {region}")
    print(" [OK] Please proceed with this notebook.")

In [None]:
# Comprehend client. It reuses the retry configuration already applied to the
# IAM client so that throttled calls (for example the repeated status polling
# further down) are retried instead of failing the cell.
comprehend = boto3.client("comprehend", config=config)

## Observamos nuestros datos de entrenamiento que van a ser usados como input para Comprehend

In [None]:
# Copy the object at the training-data S3 URI into the local ./tmp/ folder.
!aws s3 cp $comprehend_train_s3_uri ./tmp/

# Local folder and file name used for the downloaded training data.
temp_folder = "tmp"
dataset_csv = "amazon_reviews_us_Digital_Software_v1_00_comprehend.csv"

In [None]:
# Build the path from `temp_folder` and `dataset_csv` (defined in the download
# cell) instead of repeating the literal, so the location is declared in one
# place only. The file is read without a header row.
df = pd.read_csv(f"./{temp_folder}/{dataset_csv}", header=None)
df.head()

## Creamos un rol de acceso a los datos para Comprehend

## Creamos la política

In [None]:
# Trust statement: lets the Amazon Comprehend service principal assume the role.
comprehend_trust_statement = {
    "Effect": "Allow",
    "Principal": {"Service": "comprehend.amazonaws.com"},
    "Action": "sts:AssumeRole",
}

# Assume-role (trust) policy document passed to IAM when the role is created.
assume_role_policy_doc = {
    "Version": "2012-10-17",
    "Statement": [comprehend_trust_statement],
}

## Creamos el rol y le asociamos la política

In [None]:
# Name and description of the IAM role created for Amazon Comprehend.
iam_comprehend_role_name = "DSOAWS_Comprehend"
iam_comprehend_role_description="Curso MLOps Comprehend Role"

In [None]:
# Create the role, or fetch it when it already exists from a previous run.
try:
    iam_role_comprehend = iam.create_role(
        RoleName=iam_comprehend_role_name,
        AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),
        Description=iam_comprehend_role_description,
    )
except ClientError as e:
    if e.response["Error"]["Code"] != "EntityAlreadyExists":
        # Any other failure leaves `iam_role_comprehend` undefined, so re-raise
        # here rather than letting a later cell fail with a confusing NameError.
        print("Unexpected error: %s" % e)
        raise
    iam_role_comprehend = iam.get_role(RoleName=iam_comprehend_role_name)
    print("Role already exists")

In [None]:
# ARNs of the bucket itself and of every object inside it.
bucket_arn = f"arn:aws:s3:::{bucket}"
bucket_objects_arn = f"{bucket_arn}/*"

# Policy allowing object reads, bucket listing and object writes on the bucket.
comprehend_s3_policy_doc = {
    "Version": "2012-10-17",
    "Statement": [
        {"Action": ["s3:GetObject"], "Resource": [bucket_objects_arn], "Effect": "Allow"},
        {"Action": ["s3:ListBucket"], "Resource": [bucket_arn], "Effect": "Allow"},
        {"Action": ["s3:PutObject"], "Resource": [bucket_objects_arn], "Effect": "Allow"},
    ],
}

In [None]:
# Serialize the S3 policy and attach it to the Comprehend role as an inline policy.
s3_policy_json = json.dumps(comprehend_s3_policy_doc)

response = iam.put_role_policy(
    PolicyDocument=s3_policy_json,
    PolicyName="DSOAWS_ComprehendPolicyToS3",
    RoleName=iam_comprehend_role_name,
)

print(response)

## Entrenamos el modelo

In [None]:
prefix = "models"
key = "comprehend/output"

# S3 URI passed to Comprehend as the output location of the training job.
s3_output_job = "s3://{}/{}/{}".format(bucket, prefix, key)
print(s3_output_job)

In [None]:
# ARN of the Comprehend role, taken from the create_role / get_role response.
iam_role_comprehend_arn = iam_role_comprehend["Role"]["Arn"]

In [None]:
# Seconds since the epoch, used as a suffix for the classifier name.
# `strftime("%s")` relies on a platform-specific extension that Python does
# not document and that is missing on some platforms, so the value is taken
# from time.time() instead.
timestamp = str(int(time.time()))

comprehend_training_job_name = f"Amazon-Customer-Reviews-Classifier-{timestamp}"

In [None]:
# Show the values that will be passed to the classifier training call.
for training_parameter in (
    comprehend_training_job_name,
    iam_role_comprehend_arn,
    comprehend_train_s3_uri,
    s3_output_job,
):
    print(training_parameter)

In [None]:
# Input and output S3 locations for the classifier training job.
classifier_input_config = {"S3Uri": comprehend_train_s3_uri}
classifier_output_config = {"S3Uri": s3_output_job}

# Submit the custom document classifier for training on English text.
training_job = comprehend.create_document_classifier(
    LanguageCode="en",
    DocumentClassifierName=comprehend_training_job_name,
    DataAccessRoleArn=iam_role_comprehend_arn,
    InputDataConfig=classifier_input_config,
    OutputDataConfig=classifier_output_config,
)

In [None]:
# ARN of the classifier returned by the create_document_classifier call.
comprehend_training_job_arn = training_job["DocumentClassifierArn"]

print(comprehend_training_job_arn)

In [None]:
from IPython.display import display, HTML

# Console page of the classifier that was just submitted for training.
console_url = (
    f"https://console.aws.amazon.com/comprehend/v2/home?region={region}"
    f"#classifier-details/{comprehend_training_job_arn}"
)

# `_blank` is the reserved target keyword that opens a new tab; the bare name
# "blank" would instead address a single window that happens to be named "blank".
display(
    HTML(f'<b>Review <a target="_blank" href="{console_url}">Comprehend Training Job</a></b>')
)

## La siguiente celda tarda en ejecutarse unos 40 minutos, sé paciente (puedes continuar con el siguiente notebook)

In [None]:
%%time

import time

# States in which the classifier makes no further progress. Checking only
# TRAINED / IN_ERROR would keep polling until the timeout when the job was
# stopped or finished with warnings.
terminal_statuses = {"TRAINED", "TRAINED_WITH_WARNING", "IN_ERROR", "STOPPED"}

# Poll the classifier status every 10 seconds for at most 3 hours.
max_time = time.time() + 3 * 60 * 60  # 3 hours
while time.time() < max_time:
    describe_custom_classifier = comprehend.describe_document_classifier(
        DocumentClassifierArn=comprehend_training_job_arn
    )
    status = describe_custom_classifier["DocumentClassifierProperties"]["Status"]
    print(f"Custom classifier: {status}")

    if status in terminal_statuses:
        print("")
        print(f"Status {status}")
        print("")
        print(describe_custom_classifier["DocumentClassifierProperties"])
        break

    time.sleep(10)
else:
    # Only reached when the loop ends without `break`, i.e. the time budget
    # ran out before a terminal state was seen.
    print(f"[WARNING] Timed out after 3 hours; last status: {status}")

## Mostramos los resultados del clasificador

In [None]:
# Show the properties from the most recent describe_document_classifier response.
print(describe_custom_classifier["DocumentClassifierProperties"])

In [None]:
# ARN of the classifier as reported in the describe response.
model_arn = describe_custom_classifier["DocumentClassifierProperties"]["DocumentClassifierArn"]
print(model_arn)

In [None]:
import os

# Get the S3 URI of the model output and derive the object key `job_key`.
job_output = describe_custom_classifier["DocumentClassifierProperties"]["OutputDataConfig"]["S3Uri"]

path_prefix = f"s3://{bucket}/"

# S3 URIs are not filesystem paths: os.path.relpath resolves both arguments
# against the current directory, normalizes them, and uses the OS separator.
# It also silently returns a "../..." path when the URI points to another
# bucket. Strip the bucket prefix explicitly and fail loudly on a mismatch.
if not job_output.startswith(path_prefix):
    raise ValueError(f"Model output {job_output} is not located in bucket {bucket}")
job_key = job_output[len(path_prefix):]

print(job_output)
print(job_key)

## Descargamos los artefactos del modelo incluyendo las métricas de entrenamiento

In [None]:
# Download the object at `job_key` from the bucket to <temp_folder>/output.tar.gz.
s3 = boto3.resource("s3")
s3.Bucket(bucket).download_file(job_key, f"{temp_folder}/output.tar.gz")

# Alternative using the low-level S3 client instead of the resource API (disabled):
#s3 = boto3.client("s3")

#s3.download_file(
#    Bucket=bucket,
#    Key=job_key,
#    Filename=f"{temp_folder}/output.tar.gz"
#)

## Desempaquetamos el artefacto

In [None]:
# Extract the downloaded archive (verbose) relative to the current working directory.
!tar xvzf ./tmp/output.tar.gz

In [None]:
import json

# Read the confusion matrix JSON from the extracted output folder.
confusion_matrix_path = "./output/confusion_matrix.json"
with open(confusion_matrix_path) as matrix_file:
    data = json.load(matrix_file)

# Pretty-print it; values that are not JSON-serializable fall back to str().
pretty_matrix = json.dumps(data, indent=2, default=str)
print(pretty_matrix)

In [None]:
from IPython.display import HTML, display
import tabulate

# Row/column captions of the 5x5 table.
# NOTE(review): assumes the matrix rows and columns are ordered as labels
# "1" to "5" — confirm against the label order stored in the JSON file.
class_labels = ["1", "2", "3", "4", "5"]

# Header row, one row per actual class, then the "(Actual)" caption row.
table = [["", *class_labels, "(Predicted)"]]
for row_index, row_label in enumerate(class_labels):
    row_cells = [
        data["confusion_matrix"][row_index][col_index]
        for col_index in range(len(class_labels))
    ]
    table.append([row_label, *row_cells])
table.append(["(Actual)"])

display(HTML(tabulate.tabulate(table, tablefmt="html")))

## Desplegamos el endpoint

In [None]:
from time import gmtime, strftime, sleep

# Endpoint name suffixed with the current UTC day-hour-minute-second.
timestamp_suffix = strftime("%d-%H-%M-%S", gmtime())
comprehend_endpoint_name = f"comprehend-inference-ep-{timestamp_suffix}"

# Create an endpoint for the classifier with one inference unit.
inference_endpoint_response = comprehend.create_endpoint(
    DesiredInferenceUnits=1,
    ModelArn=model_arn,
    EndpointName=comprehend_endpoint_name,
)

In [None]:
# ARN of the endpoint returned by the create_endpoint call.
comprehend_endpoint_arn = inference_endpoint_response["EndpointArn"]
print(comprehend_endpoint_arn)

## Pasamos variables a los siguientes notebooks

In [None]:
# Save both ARNs in the IPython store so that other notebooks can restore them.
%store comprehend_training_job_arn
%store comprehend_endpoint_arn

# Release Resources

In [None]:
%%javascript

// Save a notebook checkpoint, then delete the notebook session.
// Any error raised by either call is deliberately ignored.
try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

In [None]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
// Programmatically click the first hidden "sm-command-button" element, which
// is wired to the kernelmenu:shutdown command. Any error is ignored.
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>