# PyTorch (HuggingFace) BERT for Question Answering on SageMaker

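This notebook trains a HuggingFace BERT model for question answering on the SQuAD dataset using the SageMaker PyTorch framework, deploys the trained model to a SageMaker endpoint, and explores its answers through an interactive widget.

**TODO:** Expand this introduction.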

## Set configurations and connect to SDKs


In [1]:
# For easier dev of local modules:
%load_ext autoreload
%autoreload 2

# Python Built-Ins:
import json

# External Dependencies:
import boto3
import sagemaker
from sagemaker.pytorch.estimator import PyTorch as PyTorchEstimator
from sagemaker.pytorch.model import PyTorchModel

# Local Dependencies:
from util import demo


In [2]:
BUCKET_NAME = "2020-05-gym-bert"
%store BUCKET_NAME

SQUAD_V2 = False  # Whether to use V2 (including unanswerable questions)
%store SQUAD_V2

Stored 'BUCKET_NAME' (str)
Stored 'SQUAD_V2' (bool)


In [3]:
# One boto3 session is shared by every AWS client/resource created in this cell:
botosess = boto3.Session()
region = botosess.region_name

# Execution role that is passed to the SageMaker estimator and model later on:
role = sagemaker.get_execution_role()

# S3 handle on the working bucket, plus a low-level SageMaker API client:
s3 = botosess.resource("s3")
bucket = s3.Bucket(BUCKET_NAME)
smclient = botosess.client("sagemaker")

## Fetch SQuAD Data

We'll fetch both the `train` and the `dev` datasets from SQuAD - which are distinct datasets without overlap:

In [4]:
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/rajpurkar/SQuAD-explorer/master/dataset"
version = "2.0" if SQUAD_V2 else "1.1"
train_raw_filename = f"train-v{version}.json"
dev_raw_filename = f"dev-v{version}.json"

!mkdir -p data/raw
!curl {DOWNLOAD_ROOT}/{train_raw_filename} --output ./data/raw/{train_raw_filename}
!curl {DOWNLOAD_ROOT}/{dev_raw_filename} --output ./data/raw/{dev_raw_filename}

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 28.8M  100 28.8M    0     0   462k      0  0:01:03  0:01:03 --:--:-- 8125k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4740k  100 4740k    0     0  4593k      0  0:00:01  0:00:01 --:--:-- 4593k


## Curate datasets and load in to S3

Although the SQuAD dev set is smaller than (and distinct from) the train set, it's still a bit large to make a performant validation dataset - so we'll split it in two to create separate "validation" and "test" datasets.

We assume there are no important correlations in the ordering of the dev dataset - so we just take the first few documents as listed for validation and leave the remainder as test.

In [5]:
# Read the raw dev set. The encoding is given explicitly (UTF-8 is the standard JSON encoding)
# so the result does not depend on the platform's default text encoding.
with open(f"data/raw/{dev_raw_filename}", "r", encoding="utf-8") as f:
    dev_data = json.load(f)

n_docs = len(dev_data["data"])
n_docs_validation = n_docs // 4  # Only use a quarter of the docs for validation, rest for test

# The first `n_docs_validation` documents (in file order) become the validation set...
val_data = {
    "data": dev_data["data"][:n_docs_validation],
    "version": version,
}
# ...and all remaining documents become the test set, so the two never overlap.
test_data = {
    "data": dev_data["data"][n_docs_validation:],
    "version": version,
}

In [6]:
# File names for the curated datasets - one per split, tagged with the SQuAD version:
train_filename, val_filename = (
    f"SQuAD-{split}-v{version}.json" for split in ("train", "validation")
)

In [7]:
!cp data/raw/{train_raw_filename} data/{train_filename}
with open(f"data/{val_filename}", "w") as f:
    f.write(json.dumps(val_data))

In [8]:
# Each local file under data/ is uploaded to the same relative key in the bucket:
bucket.upload_file(Filename=f"data/{train_filename}", Key=f"data/{train_filename}")
bucket.upload_file(Filename=f"data/{val_filename}", Key=f"data/{val_filename}")

## Configure Algorithm

Create input channels

In [9]:
# S3 URIs of the uploaded dataset files, used as the training job's input channels:
train_channel = "s3://{}/data/{}".format(BUCKET_NAME, train_filename)
val_channel = "s3://{}/data/{}".format(BUCKET_NAME, val_filename)

Estimator

In [10]:
# SageMaker Debugger hook configuration, set up to collect the "gradients" tensor collection.
# NOTE: this object is built for reference only - the estimator below is created with
# debugger_hook_config=False, so it is not currently attached to the training job.
debugger_hook_config = sagemaker.debugger.DebuggerHookConfig(
    s3_output_path=f"s3://{BUCKET_NAME}/tensors",
    container_local_output_path="/var/tensors",
    # Optional extras that are not currently used:
    #   hook_parameters={"key": "value"},
    #   sagemaker.debugger.CollectionConfig(name="custom", parameters={"key": "value"}),
    collection_configs=[sagemaker.debugger.CollectionConfig(name="gradients")],
)

# TensorBoard output configuration (likewise not currently attached to the estimator):
tensorboard_output_config = sagemaker.debugger.TensorBoardOutputConfig(
    s3_output_path=f"s3://{BUCKET_NAME}/tensorboard",
    container_local_output_path="/var/tensorboard",
)

# Metric definitions: each regex captures one value from the training job's log output.
# All but the last match text of the form "Metrics:... <key>=<value>;", so those definitions
# are generated from (metric name, log key) pairs.
metric_definitions = [
    {"Name": name, "Regex": rf"Metrics:.* {key}=(.*?);"}
    for name, key in (
        ("train:Loss", "loss"),
        ("train:LearningRate", "lr"),
        ("validation:Exact", "eval_exact"),
        ("validation:F1", "eval_f1"),
        ("validation:Total", "eval_total"),
        ("validation:NoAnsExact", "eval_NoAns_exact"),
        ("validation:NoAnsF1", "eval_NoAns_f1"),
        ("validation:NoAnsTotal", "eval_NoAns_total"),
        ("validation:BestExact", "eval_best_exact"),
        ("validation:BestExactThresh", "eval_best_exact_thresh"),
        ("validation:BestF1", "eval_best_f1"),
        ("validation:BestF1Thresh", "eval_best_f1_thresh"),
    )
]
# The seconds-per-sample figure is matched from a differently formatted "Evaluation..." line:
metric_definitions.append(
    {"Name": "validation:SecsPerSample", "Regex": r"Evaluation.* \((.*?) sec"}
)

estimator = PyTorchEstimator(
    # Training code: src/train.py
    entry_point="train.py",
    source_dir="src",

    # Job naming and S3 output locations:
    base_job_name="bert-qna-short",
    checkpoint_s3_uri=f"s3://{BUCKET_NAME}/checkpoints",
    output_path=f"s3://{BUCKET_NAME}/jobs",

    # Framework container selection:
    framework_version="1.4.0",
    py_version="py3",

    # Infrastructure: a single GPU instance, limited to 90 minutes of run time
    role=role,
    train_instance_count=1,
    train_instance_type="ml.p3.2xlarge",
    train_max_run=90 * 60,

    # Spot training is left disabled - checkpoint saving might be part-working, but resume
    # definitely isn't yet:
    #train_max_wait=60*60,
    #train_use_spot_instances=True,

    # Debugger is switched off; to enable it, pass the config objects defined above instead:
    #debugger_hook_config=debugger_hook_config,
    #tensorboard_output_config=tensorboard_output_config,
    debugger_hook_config=False,
    metric_definitions=metric_definitions,

    hyperparameters={
        "checkpoint-interval": 200,
        "epochs": 2,  # as configured, max-steps is the limiting factor
        # Passed as a string because SageMaker doesn't like bool hyperparameters:
        "has-unanswerable": "true" if SQUAD_V2 else "false",
        "log-interval": 200,
        "max-steps": 2000,
        "per-gpu-eval-batch-size": 16,
        "seed": 1337,
        #"log-level": "DEBUG",
    },
)

## Train

In [21]:
# Start the training job, mapping each input channel name to the S3 URI of its dataset:
estimator.fit(
    inputs=dict(
        train=train_channel,
        validation=val_channel,
    )
)

2020-05-06 09:57:58 Starting - Starting the training job...
2020-05-06 09:58:00 Starting - Launching requested ML instances..

KeyboardInterrupt: 

## Deploy

In [25]:
#TODO: deploy from estimator

#predictor = estimator.deploy(
#    initial_instance_count=1,
#    instance_type="ml.p2.xlarge",
#)


In [None]:
# Look up the S3 location of the model artifact from the estimator's latest training job:
model_path = estimator.latest_training_job.describe()["ModelArtifacts"]["S3ModelArtifacts"]

model = PyTorchModel(
    model_data=model_path,
    role=role,
    source_dir="src/",
    # The entry point must be a file at the root of `source_dir` (as with the estimator's
    # entry_point="train.py"), so it must not repeat the "src/" prefix:
    entry_point="inference.py",
    framework_version="1.4.0",
)

# Create a real-time endpoint serving the model on one instance:
predictor = model.deploy(initial_instance_count=1, instance_type="ml.p2.xlarge")

In [40]:
def endpoint_answer_fetcher(context, question):
    """Ask the deployed SageMaker endpoint to answer `question` about `context`.

    Sends a JSON request body of the form {"question": ..., "context": ...} to the endpoint and
    reads the "answer", "score", "start" and "end" fields from the JSON response.

    Args:
        context: The text in which to look for the answer.
        question: The question to ask about `context`.

    Returns:
        A tuple `((start, end), details)`, where `start` and `end` are taken directly from the
        endpoint response and `details` is a JSON string containing the answer, question,
        context and score.
    """
    endpoint_client = boto3.client("sagemaker-runtime")
    # Use the endpoint name recorded on the model by `model.deploy()`, rather than assuming the
    # endpoint is named after the model:
    endpoint_name = model.endpoint_name
    # To query an already-deployed endpoint instead, set its name explicitly, e.g.:
    #endpoint_name = "pytorch-inference-2020-05-06-09-22-27-318"
    content_type = "application/json"
    payload = json.dumps({"question": question, "context": context}).encode("utf-8")
    response = endpoint_client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType=content_type,
        Accept=content_type,
        Body=payload,
    )
    # The response body is a stream; read it fully and parse it as JSON:
    result = json.loads(response["Body"].read().decode("utf-8"))
    full = {
        "answer": result["answer"],
        "question": question,
        "context": context,
        "score": result["score"],
    }
    return (result["start"], result["end"]), json.dumps(full)

demo.squad_widget(test_data, endpoint_answer_fetcher)

VBox(children=(HTML(value='<p><b>🔮 SQuAD Explorer: 🔍</b> Select a document and paragraph; type a question and …

## Clean Up

In [None]:
# Delete the endpoint that was created by `model.deploy()` above, now that the demo is finished:
predictor.delete_endpoint()